Beispiel #1
0
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Classify graphs with a linear-kernel SVM and with a deep (subgraph) kernel SVM.

    Every graph file is converted into a count vector over the subgraph
    vocabulary of the embedding model, the vectors are normalised, and a single
    random 70/30 train/test split is evaluated with both classifiers.

    :param corpus_dir: directory handed to get_files to locate the graph files
    :param extn: file extension handed to get_files
    :param embedding_fname: subgraph embedding file in word2vec format
    :param class_labels_fname: labels file handed to get_class_labels
    :return: None
    '''
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    # gensim >= 4.0 removed KeyedVectors.vocab (accessing it raises
    # AttributeError) in favour of key_to_index; older releases only have
    # .vocab, so fall back to it when key_to_index is absent.
    vocab_map = getattr(gensim_model, 'key_to_index', None)
    if vocab_map is None:
        vocab_map = gensim_model.vocab
    subgraph_vocab = sorted(vocab_map)
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))

    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))

    # The vocabulary is fixed to the embedding vocabulary so that the feature
    # columns line up with the subgraph kernel built from the same list below.
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))

    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # A fresh random seed per call: results differ from run to run.
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
Beispiel #2
0
def perform_classification(corpus_dir, extension, embedding_fname,
                           class_labels_fname):
    """
    Evaluate an RBF SVM on one random 90/10 split of the graph embeddings.

    :param corpus_dir: directory handed to get_files to locate the graph files
    :param extension: file extension handed to get_files
    :param embedding_fname: JSON file mapping each graph file name (as returned
        by get_files) to its embedding vector
    :param class_labels_fname: labels file handed to get_class_labels
    :return: the value returned by rbf_svm_classify for this split
    """
    # Weisfeiler-Lehman kernel files, one per graph
    graph_files = get_files(corpus_dir, extension)

    labels = np.array(get_class_labels(graph_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(labels.shape))

    # a fresh random seed per call, so the split differs between runs
    split_seed = randint(0, 1000)

    with open(embedding_fname, 'r') as embedding_file:
        embedding_by_file = json.load(embedding_file)

    # one row per graph file, in the same order as the labels
    rows = list(map(embedding_by_file.__getitem__, graph_files))
    features = np.array(rows)

    split = train_test_split(features, labels,
                             test_size=0.1, random_state=split_seed)
    train_x, test_x, train_y, test_y = split

    shape_template = 'Training and Test Matrix Shapes: {}. {}. {}. {} '
    logging.info(shape_template.format(train_x.shape, test_x.shape,
                                       train_y.shape, test_y.shape))

    return rbf_svm_classify(train_x, test_x, train_y, test_y)
Beispiel #3
0
def main():
    """
    Evaluate the previously trained HOG SVM on the testing data set.

    Loads the saved SVM, loads the testing images, computes their HOG
    descriptors, classifies them in one batch and prints the error rate.
    Exits with status 1 when the saved SVM cannot be loaded.
    """

    # load up the SVM stored from prior training

    try:
        svm = cv2.ml.SVM_load(params.HOG_SVM_PATH_SAVED)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("Missing files  SVM")
        print("-- have you performed training to produce this file ?")
        # non-zero status: the run did not succeed; also avoids relying on the
        # site-provided exit() helper
        raise SystemExit(1)

    # load ** testing ** data sets in the same class order as training
    # (here we perform patch sampling only from the centre of the +ve
    # class and only a single sample is taken
    # hence [0,0] sample sizes and [False,True] centre weighting flags)

    print("Loading test data as a batch ...")

    paths = [params.DATA_testing_path_neg, params.DATA_testing_path_pos]
    use_centre_weighting = [False, True]
    class_names = params.DATA_CLASS_NAMES
    imgs_data = utils.load_images(paths, class_names, [0, 0],
                                  use_centre_weighting)

    print("Computing HOG descriptors...")  # for each testing image
    start = cv2.getTickCount()
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(start)

    # get the example/sample HOG descriptors and class labels

    samples = utils.get_hog_descriptors(imgs_data)
    class_labels = utils.get_class_labels(imgs_data)

    # perform batch SVM classification over the whole set

    print("Performing batch SVM classification over all data  ...")

    results = svm.predict(samples)
    output = results[1].ravel()

    # compute and report the error over the whole set: the fraction of samples
    # whose predicted label differs from the true label (identical to the mean
    # absolute label difference for 0/1 labels, and still a valid error rate
    # if further classes are added)

    error = float(np.mean(class_labels.ravel() != output))
    print("Successfully trained SVM with {}% testing set error".format(
        round(error * 100, 2)))
    print(
        "-- meaining the SVM got {}% of the testing examples correct!".format(
            round((1.0 - error) * 100, 2)))
Beispiel #4
0
def cross_val_accuracy(corpus_dir,
                       extension,
                       embedding_fname,
                       class_labels_fname,
                       cv=10,
                       mode=None):
    """
    Run `cv` independent train/test experiments and summarise their accuracy.

    Each experiment draws a new random seed, makes a random 90/10 split of the
    data and trains/evaluates an SVM on it. Note that these are repeated random
    splits, not disjoint k-fold partitions.

    :param corpus_dir: directory handed to get_files to locate the graph files
    :param extension: file extension handed to get_files
    :param embedding_fname: JSON file mapping each graph file name (as returned
        by get_files) to its embedding vector
    :param class_labels_fname: labels file handed to get_class_labels
    :param cv: number of experiments to carry out (default 10)
    :param mode: "linear" selects linear_svm_classify; any other value
        (including the default None) selects rbf_svm_classify
    :return: tuple (mean, standard deviation) of the first element of each
        experiment's classifier result
    """
    # weisfeiler lehman kernel files
    wlk_files = get_files(corpus_dir, extension)

    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # The embeddings do not change between experiments, so read the file and
    # build the feature matrix once instead of on every iteration.
    with open(embedding_fname, 'r') as fh:
        graph_embedding_dict = json.load(fh)
    X = np.array([graph_embedding_dict[fname] for fname in wlk_files])

    classify = linear_svm_classify if mode == "linear" else rbf_svm_classify

    # our accuracies
    acc_results = []
    for _ in range(cv):
        seed = randint(0, 1000)
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.1,
                                                            random_state=seed)

        scores = classify(X_train, X_test, Y_train, Y_test)
        acc_results.append(scores[0])

    return np.mean(acc_results), np.std(acc_results)
def perform_classification (corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Run a linear SVM on two feature sets built from the same graph files.

    The first feature set is a normalised count matrix produced by
    CountVectorizer over the files; the second is the per-file embedding
    vectors read from a JSON file. Both are evaluated on a random 90/10 split
    made with the same seed.

    :param corpus_dir: directory handed to get_files to locate the graph files
    :param extn: file extension handed to get_files
    :param embedding_fname: JSON file mapping each graph file name (as returned
        by get_files) to its embedding vector
    :param class_labels_fname: labels file handed to get_class_labels
    :return: None
    '''
    graph_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(graph_files)))

    vectorizer = CountVectorizer(input='filename',
                                 tokenizer=subgraph2vec_tokenizer,
                                 lowercase=False)
    normalizer = Normalizer()

    counts = vectorizer.fit_transform(graph_files)
    bag_features = normalizer.fit_transform(counts)
    logging.info('X (sample) matrix shape: {}'.format(bag_features.shape))

    labels = np.array(get_class_labels(graph_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(labels.shape))

    # one seed shared by both evaluations
    split_seed = randint(0, 1000)

    def split_and_classify(features):
        # split, report the shapes, then train/evaluate the linear SVM
        parts = train_test_split(features, labels, test_size=0.1, random_state=split_seed)
        train_x, test_x, train_y, test_y = parts
        logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(train_x.shape, test_x.shape,
                                                                            train_y.shape, test_y.shape))
        linear_svm_classify(train_x, test_x, train_y, test_y)

    # 1) normalised count features
    split_and_classify(bag_features)

    # 2) embedding features, used exactly as loaded (no normalisation applied)
    with open(embedding_fname, 'r') as embedding_file:
        embedding_by_file = json.load(embedding_file)
    embedded_features = np.array([embedding_by_file[name] for name in graph_files])

    split_and_classify(embedded_features)
Beispiel #6
0
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Classify graphs with a linear-kernel SVM and with a deep (subgraph) kernel SVM.

    Every graph file is converted into a count vector over the subgraph
    vocabulary of the embedding model, the vectors are normalised, and a single
    random 90/10 train/test split is evaluated with both classifiers.

    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    '''
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    # KeyedVectors.vocab no longer exists in gensim >= 4.0 (it raises
    # AttributeError); key_to_index is its replacement. Older gensim releases
    # only expose .vocab, so use whichever mapping is available.
    known_subgraphs = getattr(gensim_model, 'key_to_index', None)
    if known_subgraphs is None:
        known_subgraphs = gensim_model.vocab
    subgraph_vocab = sorted(known_subgraphs)
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))

    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))

    # Restrict the count features to the embedding vocabulary so the columns
    # match the subgraph kernel computed from the same sorted list below.
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))

    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    # The seed is drawn at random, so the split changes on every call.
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
Beispiel #7
0
def perform_classification(corpus_dir, extn, embeddings, class_labels_fname):
    '''
    Evaluate a linear SVM on precomputed embeddings with stratified 10-fold CV.

    Prints the mean and standard deviation of the per-fold results.

    :param corpus_dir: directory handed to get_files to locate the graph files
    :param extn: file extension handed to get_files
    :param embeddings: feature matrix, indexed with the fold index arrays;
        assumes row i belongs to the i-th file returned by get_files -- TODO confirm
    :param class_labels_fname: labels file handed to get_class_labels
    :return: mean of the per-fold values returned by linear_svm_classify
    '''
    from sklearn.model_selection import StratifiedKFold

    # the file list is only needed to look up the labels in matching order
    wlk_files = get_files(corpus_dir, extn)
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))

    X = embeddings

    # random_state=None: the shuffled folds differ from run to run
    kf = StratifiedKFold(10, shuffle=True, random_state=None)
    accs = []
    for train_index, test_index in kf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        acc = linear_svm_classify(X_train, X_test, Y_train, Y_test)
        accs.append(acc)

    mean_acc = np.mean(accs)
    print(mean_acc, np.std(accs))
    return mean_acc
def main():
    """
    Train the HOG + SVM detector and report its training-set error.

    Loads the training images, computes a HOG descriptor for each, trains an
    OpenCV SVM with automatic parameter search, saves the model to
    params.HOG_SVM_PATH_TRAIN and prints the error over the training samples.
    """

    ############################################################################
    # load our training data set of images examples

    program_start = cv2.getTickCount()

    print("Loading images...")
    start = cv2.getTickCount()

    # N.B. specify data path names in same order as class names (neg, pos)

    paths = [params.DATA_training_path_neg, params.DATA_training_path_pos]

    # build a list of class names automatically from our dictionary of class (name,number) pairs

    class_names = [utils.get_class_name(class_number) for class_number in range(len(params.DATA_CLASS_NAMES))]

    # specify number of sub-window samples to take from each positive and negative
    # example image in the data set
    # N.B. specify in same order as class names (neg, pos) - again

    sampling_sizes = [params.DATA_training_sample_count_neg, params.DATA_training_sample_count_pos]

    # do we want to take samples only centric to the example image or randomly?
    # No - for background -ve images (first class)
    # Yes - for object samples +ve images (second class)

    sample_from_centre = [False, True]

    # perform image loading

    imgs_data = utils.load_images(paths, class_names, sampling_sizes, sample_from_centre,
                                  params.DATA_WINDOW_OFFSET_FOR_TRAINING_SAMPLES, params.DATA_WINDOW_SIZE)

    print("Loaded {} image(s)".format(len(imgs_data)))
    utils.print_duration(start)

    ############################################################################
    # perform HOG feature extraction

    print("Computing HOG descriptors...") # for each training image
    start = cv2.getTickCount()
    # each HoG descriptor is stored in its respective img_data instance
    for img_data in imgs_data:
        img_data.compute_hog_descriptor()
    utils.print_duration(start)

    ############################################################################
    # train an SVM based on these norm_features

    print("Training SVM...")
    start = cv2.getTickCount()

    # define SVM parameters
    svm = cv2.ml.SVM_create()
    svm.setType(cv2.ml.SVM_C_SVC)           # set SVM type
    svm.setKernel(params.HOG_SVM_kernel)    # use specific kernel type

    # get hog descriptor for each image and store in single global array
    samples = utils.get_hog_descriptors(imgs_data)

    # get class label for each training image (i.e. 0 for other, 1 for pedestrian... can extend)
    class_labels = utils.get_class_labels(imgs_data)

    # specify the termination criteria for the SVM training
    svm.setTermCriteria((cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT, params.HOG_SVM_max_training_iterations, 1.e-06))

    # perform auto training for the SVM which will essentially perform grid
    # search over the set of parameters for the chosen kernel and the penalty
    # cost term, C (N.B. trainAuto() syntax is correct as of OpenCV 3.4.x)
    svm.trainAuto(samples, cv2.ml.ROW_SAMPLE, class_labels, kFold = 10, balanced = True)

    # save the trained SVM to file so that we can load it again for testing / detection
    svm.save(params.HOG_SVM_PATH_TRAIN)

    ############################################################################
    # measure performance of the SVM trained on the HOG features

    # perform prediction over the set of examples we trained over
    output = svm.predict(samples)[1].ravel()

    # error = fraction of samples whose predicted label differs from the true
    # label (equal to the mean absolute label difference for 0/1 labels, and
    # still a valid error rate if more classes are added)
    error = float(np.mean(class_labels.ravel() != output))

    # we are successful if our prediction > than random
    # e.g. for 2 class labels this would be 1/2 = 0.5 (i.e. 50%)
    if error < (1.0 / len(params.DATA_CLASS_NAMES)):
        print("Trained SVM obtained {}% training set error".format(round(error * 100,2)))
        print("-- meaining the SVM got {}% of the training examples correct!".format(round((1.0 - error) * 100,2)))
    else:
        print("Failed to train SVM. {}% error".format(round(error * 100,2)))

    utils.print_duration(start)

    print("Finished training HoG detector. {}".format(format_time(get_elapsed_time(program_start))))