def load_page_data_ls12(data_folder, filenames):
    """
    Loads keypoints and labels for each file in filenames. data_folder must contain three
    files per entry in 'filenames': <filename>.keypoints_x, <filename>.keypoints_y,
    <filename>.vw.

    @param data_folder: Folder containing text files for each page in filenames.
    @param filenames: List containing page filenames WITHOUT file-extension.
    @return: Keypoints-list and labels-list.
    """
    keypoints = []
    labels = []
    for f in filenames:
        x = []
        y = []
        with open("{}/{}.keypoints_x".format(data_folder, f), "r") as infile:
            x_list = infile.readlines()
            x.extend([int(float(n.strip())) for n in x_list])
        with open("{}/{}.keypoints_y".format(data_folder, f), "r") as infile:
            y_list = infile.readlines()
            y.extend([int(float(n.strip())) for n in y_list])
        keypoints.append(np.array([x, y]).T)
        with open("{}/{}.vw".format(data_folder, f), "r") as infile:
            vw_list = infile.readlines()
            labels.append([int(float(n.strip()))-1 for n in vw_list])
        log.d("Read {} keypoints and labels for page '{}'".format(len(y_list), f))
    return keypoints, labels
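# Illustrative sketch (assumption about the on-disk layout implied by the loader above,
# not part of the original project): each page contributes three plain-text files with
# one value per line, so that line i of all three files describes keypoint i.
import os

def _write_dummy_ls12_page(data_folder, name, xs, ys, vws):
    with open(os.path.join(data_folder, name + ".keypoints_x"), "w") as outfile:
        outfile.write("\n".join(str(v) for v in xs))
    with open(os.path.join(data_folder, name + ".keypoints_y"), "w") as outfile:
        outfile.write("\n".join(str(v) for v in ys))
    with open(os.path.join(data_folder, name + ".vw"), "w") as outfile:
        outfile.write("\n".join(str(v) for v in vws))

# e.g. _write_dummy_ls12_page("/tmp", "2700270", [5, 10, 15], [5, 5, 5], [3, 1, 2])
#      keypoints, labels = load_page_data_ls12("/tmp", ["2700270"])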
def __evaluate_results(top_results, query_word, query_occurences, logresults=True):
    """
    Calculate key figures from query results.

    @param top_results: List containing the best results, sorted ascending by distance to the
        query feature vector. Contains tuples whose last two entries are (wordstring, distance to query).
    @param query_word: Word that was searched for.
    @param query_occurences: Number of times the query word occurs in the searched corpus.
    @param logresults: If True, also print recall, precision and average precision.
    @return: Average Precision, Recall, Precision (or None, None, None if the word does not occur).
    """
    if query_occurences == 0:
        if logresults:
            log.d("Queryword {} is not represented in searched text.".format(query_word))
        return None, None, None
    num_results = len(top_results)
    found = 0
    it_num = 1
    top = 0
    for _, word, dist in top_results:
        if word == query_word:
            found += 1
            top += float(found)/it_num
        it_num += 1
    ap = 100 * float(top) / found if found != 0 else 0.0
    recall = 100 * float(found) / query_occurences
    precision = 100 * float(found) / num_results
    if logresults:
        __log_results("Query: '{}'".format(query_word), recall, precision, ap)
    return ap, recall, precision
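# Illustrative sketch (toy data, not part of the original module): the average-precision
# arithmetic used above, applied by hand to a small ranked result list. With hits at
# ranks 1 and 3, AP = (1/1 + 2/3) / 2 ~ 83.3%, and recall is 2/3 for three occurrences.
def _toy_average_precision():
    top_results = [(0, "orders", 0.1), (1, "letters", 0.2), (2, "orders", 0.3)]
    query_word, query_occurences = "orders", 3
    found, top = 0, 0.0
    for it_num, (_, word, _dist) in enumerate(top_results, start=1):
        if word == query_word:
            found += 1
            top += float(found) / it_num
    ap = 100 * top / found if found else 0.0            # ~83.33
    recall = 100 * float(found) / query_occurences      # ~66.67
    precision = 100 * float(found) / len(top_results)   # ~66.67
    return ap, recall, precision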
    def build_feature_vectors_matrix(self, trans_data_pages, keypoint_data_pages, label_data_pages, step_size):
        """
        Builds feature matrix containing one row per word image and one column per feature.

        @param trans_data_pages: List of pages. Each page is a list containing TransData objects for each word.
        @param keypoint_data_pages: List of keypoint arrays, one per page.
        @param label_data_pages: List of label arrays, one per page.
        @param step_size: Distance between neighbouring keypoints on the dense grid.
        @return: Matrix containing one row per word and one column per feature.
        """
        label_matrices = self.__build_pages_matrices(keypoint_data_pages, label_data_pages)
        num_pages = len(label_matrices)
        num_rows = sum([len(t) for t in trans_data_pages])
        feat_mat = np.zeros(shape=(num_rows, self.spatial_pyramid.descriptor_size()))
        i = 0
        for page_idx in range(num_pages):
            keypoints = keypoint_data_pages[page_idx]
            label_matrix = label_matrices[page_idx]
            for word_data in trans_data_pages[page_idx]:
                origin = keypoints[0]
                x = math.ceil((word_data.xstart - origin[0]) / float(step_size))
                y = math.ceil((word_data.ystart - origin[1]) / float(step_size))
                dx = math.floor(word_data.width / float(step_size))
                dy = math.floor(word_data.height / float(step_size))
                # in terms of matrix notation, y is row here!
                # +1 is necessary, because in python 1:5 is 1,2,3,4!
                desc_mat = label_matrix[int(y):int(y+dy+1), int(x):int(x+dx+1)]
                # print(desc_mat.shape)
                visual_descriptor = self.spatial_pyramid.calculate_descriptor_from_mat(desc_mat)
                feat_mat[i] = visual_descriptor
                # fw_matrix[i] = mathutils.normalize(fw_matrix[i])
                i += 1
        log.d("Visual feature-matrix has shape {}".format(feat_mat.shape))
        return feat_mat
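# Illustrative sketch (made-up numbers, not part of the original class): how a word
# bounding box is mapped onto the dense keypoint grid above. With step_size = 5 and
# the first keypoint at (10, 10), a word at (22, 37) of size 31 x 14 covers grid
# columns ceil(12/5)=3 .. 3+floor(31/5)=9 and rows ceil(27/5)=6 .. 6+floor(14/5)=8.
def _toy_word_window():
    import math
    import numpy as np
    step_size, origin = 5, (10, 10)
    xstart, ystart, width, height = 22, 37, 31, 14
    x = int(math.ceil((xstart - origin[0]) / float(step_size)))
    y = int(math.ceil((ystart - origin[1]) / float(step_size)))
    dx = int(math.floor(width / float(step_size)))
    dy = int(math.floor(height / float(step_size)))
    label_matrix = np.arange(20 * 20).reshape(20, 20)     # dummy page label grid
    desc_mat = label_matrix[y:y + dy + 1, x:x + dx + 1]   # rows are y, columns are x
    return desc_mat.shape                                 # (3, 7)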
def precalculate_vis(nviscs, vsps):
    step_size = 5

    # contains .npy files for each page
    train_page_data_folder = "{}/gw_ls12/page_data".format(ROOT_FOLDER)
    # contains word image annotations for each page
    train_page_trans_folder = "{}/gw_ls12/GT".format(ROOT_FOLDER)
    filenames = ['2700270', '2710271', '2720272', '2730273', '2740274',
                 '2750275', '2760276', '2770277', '2780278', '2790279',
                 '3000300', '3010301', '3020302', '3030303', '3040304',
                 '3050305', '3060306', '3070307', '3080308', '3090309']

    trans_data_pages = trans.load_transcription_data(train_page_trans_folder, filenames)

    for nvisc in nviscs:
        log.d("Loading data for {} training pages...".format(len(filenames)))
        page_data_files = ["{}/{}/{}.npy".format(train_page_data_folder, nvisc, f) for f in filenames]
        keypoint_data_pages, label_data_pages = visimport.load_page_data(page_data_files)

        for vsp in vsps:
            spatial_pyramid = SpatialPyramid(vsp, nvisc)
            vis_feat_mat = BofGenerator(spatial_pyramid).build_feature_vectors_matrix(trans_data_pages,
                                                                                      keypoint_data_pages,
                                                                                      label_data_pages,
                                                                                      step_size)

            outname = __gw_param_config_string(nvisc, vsp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), vis_feat_mat)
def precalculate_unipen(nonlcs, osps):
    unipen_folders = ["aeb", "asl", "ben", "cb", "ckb", "dlm", "etb", "gl", "ja", "jdc", "jhc", "jma", "kaj", "kew",
                      "ksc", "lac", "lcf", "mek", "mml", "mmm", "nco", "pm", "rn", "rv", "sbc", "scd", "sd", "sij",
                      "skh", "skw", "srs"]
    # we need to filter out the few word occurrences in the chosen unipen subset that contain uppercase letters
    contains_uppercase = lambda t: any([c in t for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])
    unipen_traj_keypoint_paths = ["{}/unipen/keypointss/sta0.hpb0-{}/{}".format(ROOT_FOLDER, upfol, t)
                        for upfol in unipen_folders
                        for t in sorted(os.listdir("{}/unipen/keypointss/sta0.hpb0-{}".format(ROOT_FOLDER, upfol)))
                        if not contains_uppercase(t)]

    for nonlc in nonlcs:
        unipen_traj_label_paths = [p.replace("keypointss", "labels/{}".format(nonlc)).replace("keypoints", "ow") for p in unipen_traj_keypoint_paths]
        unipen_traj_keypoint_data = [trajimport.read_traj_keypoints(p) for p in unipen_traj_keypoint_paths]
        unipen_traj_label_data = [trajimport.read_traj_labels(p) for p in unipen_traj_label_paths]

        for osp in osps:
            print("")
            log.d("[Loading Trainingdata]")

            traj_spatial_pyramid = SpatialPyramid(osp, nonlc)
            test_boof_generator = BoofGenerator(traj_spatial_pyramid)
            traj_feat_mat = test_boof_generator.build_feature_vectors_matrix(unipen_traj_keypoint_data, unipen_traj_label_data)

            outname = __unipen_param_config_string(nonlc, osp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), traj_feat_mat)
def precalculate_gwo(nonlcs, osps):
    filenames = ['2700270', '2710271', '2720272', '2730273', '2740274',
                 '2750275', '2760276', '2770277', '2780278', '2790279',
                 '3000300', '3010301', '3020302', '3030303', '3040304',
                 '3050305', '3060306', '3070307', '3080308', '3090309']

    gwo_traj_folder = "{}/gw_online".format(ROOT_FOLDER)
    gwo_traj_keypoint_paths = []
    for folder in filenames:
        path = "{}/keypointss/{}".format(gwo_traj_folder, folder)
        traj_names = sorted(file for file in os.listdir(path) if file.endswith(".keypoints"))
        gwo_traj_keypoint_paths.extend(["{}/{}".format(path, traj) for traj in traj_names])

    for nonlc in nonlcs:
        gwo_traj_label_paths = [p.replace("keypointss", "labels/{}".format(nonlc)).replace("keypoints", "ow") for p in gwo_traj_keypoint_paths]
        gwo_traj_keypoint_data = [trajimport.read_traj_keypoints(p) for p in gwo_traj_keypoint_paths]
        gwo_traj_label_data = [trajimport.read_traj_labels(p) for p in gwo_traj_label_paths]

        for osp in osps:
            print("")
            log.d("[Loading Trainingdata]")

            traj_spatial_pyramid = SpatialPyramid(osp, nonlc)
            test_boof_generator = BoofGenerator(traj_spatial_pyramid)
            traj_feat_mat = test_boof_generator.build_feature_vectors_matrix(gwo_traj_keypoint_data, gwo_traj_label_data)

            outname = __gwo_param_config_string(nonlc, osp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), traj_feat_mat)
def learn_embedded_attributes(data_mat, phoc_mat):
    """
    Learns one SVM for each PHOC-attribute.

    @param data_mat: Each row is a feature vector.
    @param phoc_mat: Each row is a phoc vector.
    @return: One SVM per PHOC attribute.
    """
    num_attributes = phoc_mat.shape[1]
    svms = np.empty(num_attributes, dtype=object)
    invalid = 0
    log.d("Training SVMs for {} attributes...".format(num_attributes))
    for att in range(num_attributes):
        log.update_progress(att + 1, num_attributes)
        labels = phoc_mat[:, att]
        # if we have only one class (all 0 or all 1) we can't train an svm;
        # 'labels' holds one entry per training example, so compare against its length
        if sum(labels) == 0 or sum(labels) == len(labels):
            svms[att] = None
            invalid += 1
            continue
        clf = svm.LinearSVC()
        clf.fit(data_mat, labels)
        svms[att] = clf
    print("")
    log.d("{} invalid attributes".format(invalid))
    return svms
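# Illustrative usage sketch (random toy data, not from the original project): the
# per-attribute training step above boils down to one binary LinearSVC per PHOC column.
def _toy_attribute_svm():
    import numpy as np
    from sklearn import svm
    rng = np.random.RandomState(0)
    data_mat = np.vstack([rng.rand(10, 8), rng.rand(10, 8) + 1.0])  # 20 samples
    labels = np.array([0] * 10 + [1] * 10)   # one column of a toy phoc_mat
    clf = svm.LinearSVC()
    clf.fit(data_mat, labels)
    return clf.decision_function(data_mat[:3])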
def extract_ow():
    filenames = [
        "{}/{}/{}".format(RAW_TRAJ_FOLDER, dir, f)
        for dir in os.listdir(RAW_TRAJ_FOLDER)
        if os.path.isdir("{}/{}".format(RAW_TRAJ_FOLDER, dir))
        for f in os.listdir("{}/{}".format(RAW_TRAJ_FOLDER, dir))
        if f.endswith(".txt")
    ]

    log.d("Loading trajectories and calculating feature vectors...")
    feature_vectors = []
    word_sizes = []
    points_normed = []
    for i, f in enumerate(filenames):
        traj, _ = read_trajectory_from_file(f)
        traj_normed = norm.normalize_trajectory(traj, NORM_ARGS)
        write_keypoints(traj_normed, f, ROOT_FOLDER)
        feat_vec = feat.calculate_feature_vector_sequence(
            traj_normed, FEAT_ARGS)
        feature_vectors.extend(feat_vec)
        word_sizes.append(len(traj_normed))
        points_normed.extend([[int(n) for n in p] for p in traj_normed])
        log.update_progress(i + 1, len(filenames))
    print("")
    log.d("Accumulated {} feature vectors.".format(len(feature_vectors)))
    label_offsets = [0] + list(mathutils.accumulate(word_sizes))

    # cluster feature vectors
    log.d("Using Lloyd's algorithm to find {} clusters...".format(N_CENTROIDS))
    clusters, labels = clustering.cluster(np.array(feature_vectors),
                                          N_CENTROIDS)

    # create output dir
    try:
        os.makedirs("{}/unipen/clusters/{}".format(ROOT_FOLDER, N_CENTROIDS))
    except OSError:
        # directory already exists
        pass

    # write codebook
    with open(
            "{}/unipen/clusters/{}/clusters_online_up_17feat.txt".format(
                ROOT_FOLDER, N_CENTROIDS), "w") as outfile:
        log.d("Writing {} clusters...".format(clusters.shape[0]))
        for c in clusters:
            outfile.write(' '.join([str(feature) for feature in c]) + '\n')

    # save labels for each trajectory
    for idx, name in enumerate(filenames):
        out_writer_dir, fname = keypoints_outfile_for_inpath(
            name, N_CENTROIDS, ROOT_FOLDER)
        log.d("Writing {}.ow".format(fname))
        with open("{}/{}.ow".format(out_writer_dir, fname), "w") as outfile:
            start_label_idx = label_offsets[idx]
            end_label_idx = label_offsets[idx + 1]
            print("... indices {} to {}".format(start_label_idx,
                                                end_label_idx))
            current_labels = labels[start_label_idx:end_label_idx]
            outfile.write('\n'.join([str(item) for item in current_labels]))
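# Illustrative sketch (using itertools.accumulate from Python 3 in place of the
# project's mathutils.accumulate): how the flat label list produced above is sliced
# back into per-trajectory chunks via the accumulated word sizes.
def _toy_label_offsets():
    from itertools import accumulate
    word_sizes = [3, 5, 2]
    labels = list(range(10))                        # 3 + 5 + 2 quantized labels
    label_offsets = [0] + list(accumulate(word_sizes))
    return [labels[label_offsets[i]:label_offsets[i + 1]]
            for i in range(len(word_sizes))]        # [[0, 1, 2], [3..7], [8, 9]]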
 def save_to_file(self, path):
     log.d("Saving svms to {}".format(path))
     # 'file()' only exists in Python 2; open() works on both Python 2 and 3
     with open(path, "wb") as f:
         np.save(f, self.attribute_svms)
         np.save(f, self.platts)
         np.save(f, self.transform)
         if self.platts:
             np.save(f, self.sigmoid_params)
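# Illustrative counterpart sketch (assumption, not part of the original class): reading
# the arrays back in the same order save_to_file writes them above. Object arrays of
# pickled SVMs need allow_pickle=True on newer numpy versions.
def _load_svms_sketch(path):
    import numpy as np
    with open(path, "rb") as infile:
        attribute_svms = np.load(infile, allow_pickle=True)
        platts = np.load(infile, allow_pickle=True)
        transform = np.load(infile, allow_pickle=True)
        sigmoid_params = np.load(infile, allow_pickle=True) if platts else None
    return attribute_svms, platts, transform, sigmoid_params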
def run_evaluation(query_mat,
                   query_words,
                   corpus_mat,
                   corpus_words,
                   drop_first=False):
    """
    Using each feature vector (row) in query_mat once as query, run comparisons with all
    feature vectors (rows) in corpus_mat.
    """
    num_queries = len(query_mat)
    log.d("Running {} queries...".format(num_queries))
    recall = 0.0
    precision = 0.0
    avg_precision = 0.0
    up = log.update_progress
    gtr = get_top_results
    er = __evaluate_results
    invalid = 0
    for i, query_vec in enumerate(query_mat):
        word = query_words[i]
        top_results = gtr(corpus_mat,
                          query_vec,
                          corpus_words,
                          corpus_mat.shape[0],
                          drop_first=drop_first)
        query_occurences = sum([1 for w in corpus_words if w == word])
        if drop_first:
            query_occurences -= 1
        ap, rec, prec = er(top_results,
                           word,
                           query_occurences,
                           logresults=False)
        if ap is None:
            # don't count invalid queries
            invalid += 1
        else:
            recall += rec
            precision += prec
            avg_precision += ap
        up(i + 1, num_queries)
    print("")
    num_queries -= invalid
    if num_queries == 0:
        log.e("No valid queries")
        return [0, 0, 0, 0]
    recall /= num_queries
    precision /= num_queries
    avg_precision /= num_queries

    __log_results("Results for {} queries:".format(num_queries), recall,
                  precision, avg_precision)

    return [num_queries, recall, precision, avg_precision]
 def build_feature_vectors_matrix(self, keypoints, labels):
     log.d("Building trajectory feature matrix...")
     num_features = self.spatial_pyramid.descriptor_size()
     num_examples = len(keypoints)
     feat_mat = np.zeros(shape=(num_examples, num_features))
     i = 0
     for keyp, lab in zip(keypoints, labels):
         feat_mat[i] = self.__build_feature_vector(keyp, lab)
         i += 1
         log.update_progress(i, num_examples)  # i was already incremented above
     print("")
     log.d("Accumulated {} feature vectors.".format(len(feat_mat)))
     return np.array(feat_mat)
def extract_ow():
    ma_data = "/home/chris/Work/MA.data"

    pages = ['2700270']  #, '2710271', '2720272', '2730273', '2740274',
    #'2750275', '2760276', '2770277', '2780278', '2790279',
    #'3000300', '3010301', '3020302', '3030303', '3040304',
    #'3050305', '3060306', '3070307', '3080308', '3090309']
    train_traj_folder = "{}/gw_online".format(ma_data)
    filenames = []
    for folder in pages:
        path = "{}/{}".format(train_traj_folder, folder)
        traj_names = sorted(file for file in os.listdir(path)
                            if file.endswith(".txt"))
        filenames.extend(["{}/{}".format(path, traj) for traj in traj_names])

    # load trajectories
    log.d("Loading trajectories and calculating feature vectors...")

    # cluster feature vectors
    word_sizes = []
    labels = []
    traj_clusters = trajimport.read_traj_clusters(CODEBOOK_FILE)
    for i, f in enumerate(filenames):
        traj, _ = read_trajectory_from_file(f)
        traj_normed = norm.normalize_trajectory(traj, NORM_ARGS)
        write_keypoints(traj_normed, f)
        feat_vec = feat.calculate_feature_vector_sequence(
            traj_normed, FEAT_ARGS)
        labels.extend(
            quantization.quantize_descriptors(feat_vec, traj_clusters))
        #feature_vectors.extend(feat_vec)
        word_sizes.append(len(traj_normed))
        #points_normed.extend([[int(n) for n in p] for p in traj_normed])
        log.update_progress(i + 1, len(filenames))
    print("")
    label_offsets = [0] + list(mathutils.accumulate(word_sizes))

    # save labels for each trajectory
    for idx, name in enumerate(filenames):
        out_writer_dir, fname = keypoints_outfile_for_inpath(name, N_CENTROIDS)
        log.d("Writing {}.ow".format(fname))
        with open("{}/{}.ow".format(out_writer_dir, fname), "w") as outfile:
            start_label_idx = label_offsets[idx]
            end_label_idx = label_offsets[idx + 1]
            print("... indices {} to {}".format(start_label_idx,
                                                end_label_idx))
            current_labels = labels[start_label_idx:end_label_idx]
            outfile.write('\n'.join([str(item) for item in current_labels]))
 def build_textual_feature_vectors_matrix(self, trans_data_pages=None):
     """
     Builds matrix containing one row per word and one column per feature.
     """
     if trans_data_pages is None:
         pages = self.trans_data_pages
     else:
         pages = trans_data_pages
     num_rows = sum([len(t) for t in pages])
     text_feat_mat = np.zeros(shape=(num_rows, self.feat_vec_size))
     i = 0
     for trans_page in pages:
         for word_data in trans_page:
             text_feat_mat[i] = self.build_textual_feature_vector(word_data.word)
             i += 1
     log.d("n-gram feature-vector-matrix has shape {}".format(text_feat_mat.shape))
     return text_feat_mat
def load_page_data(filenames):
    """
    Loads keypoints and labels for each file in filenames.

    @param filenames: List containing full paths to the per-page .npy files.
    @return: Keypoints-list, labels-list.
    """
    keypoints = []
    labels = []
    for f in filenames:
        with open(f, "rb") as infile:  # np.load expects a binary file handle
            k = np.load(infile)
            l = np.load(infile)
            keypoints.append(np.array(k))
            labels.append(np.array(l))
            log.d("Read {} keypoints and labels for page '{}'".format(len(k), f))
    return keypoints, labels
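# Illustrative counterpart sketch (assumption, not from the original project): each
# page file read above is expected to hold two arrays written back-to-back with
# np.save, keypoints first and labels second.
def _write_dummy_page_npy(path):
    import numpy as np
    with open(path, "wb") as outfile:
        np.save(outfile, np.array([[10, 10], [15, 10], [20, 10]]))  # keypoints
        np.save(outfile, np.array([3, 1, 2]))                       # labels
# e.g. _write_dummy_page_npy("/tmp/2700270.npy")
#      keypoints, labels = load_page_data(["/tmp/2700270.npy"])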
 def build_textual_feature_vectors_matrix(self, trans_data_pages=None):
     """
     Builds matrix containing one row per word and one column per feature.
     """
     if trans_data_pages is None:
         pages = self.trans_data_pages
     else:
         pages = trans_data_pages
     num_rows = sum([len(t) for t in pages])
     text_feat_mat = np.zeros(shape=(num_rows, self.feat_vec_size))
     i = 0
     for trans_page in pages:
         for word_data in trans_page:
             text_feat_mat[i] = self.build_textual_feature_vector(
                 word_data.word)
             i += 1
     log.d("PHOC feature-vector-matrix has shape {}".format(
         text_feat_mat.shape))
     return text_feat_mat
def predict_embedded_attributes_labels(data_mat, svms):
    """
    Calculate class label predictions for each feature vector (=row) in data_mat.

    @return: Matrix with each column containing class labels for one feature vector.
    """
    num_attributes = len(svms)
    num_examples = data_mat.shape[0]
    A = np.zeros(shape=(num_attributes, num_examples))
    log.d("Classifying {} examples...".format(num_examples))
    for att_idx, svm in enumerate(svms):
        log.update_progress(att_idx + 1, num_attributes)
        if svm is not None:
            if sklearn.__version__ == '0.14.1':
                A[att_idx] = svm.predict(data_mat)
            else:
                # the return format of this function was changed in 0.15...
                A[att_idx] = svm.predict(data_mat).T
    print("")
    return A
def classify_embedded_attributes(data_mat, svms):
    """
    Calculate SVM-scores for each feature vector (=row) in data_mat.

    @return: Matrix with each column representing PHOC-transformation of one feature vector.
    """
    num_attributes = len(svms)
    num_examples = data_mat.shape[0]
    A = np.zeros(shape=(num_attributes, num_examples))
    log.d("Classifying {} examples...".format(num_examples))
    for att_idx, svm in enumerate(svms):
        log.update_progress(att_idx + 1, num_attributes)
        if svm is not None:
            if sklearn.__version__ == '0.14.1':
                A[att_idx] = svm.decision_function(data_mat)
            else:
                # the return format of this function was changed in 0.15...
                A[att_idx] = svm.decision_function(data_mat).T
    print("")
    return A
def __log_results(header, mr, mp, map):
    print("")
    log.d(header)
    log.d("mR  {0:.2f}%".format(mr))
    log.d("mP  {0:.2f}%".format(mp))
    log.d("mAP {0:.2f}%".format(map))
def run_experiment():
    ma_data = "/home/cwieprec/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4

    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{
        "vw": 2048,
        "vsp": [[3, 2], [2, 1]],
        "ow": 512,
        "osp": [[9, 2], [3, 2]]
    }]

    # load annotation data
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)

    # all word annotations in dataset
    gw_words = np.array([w.word for p in gw_trans_data_pages for w in p])

    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(
        mathutils.accumulate([len(p) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    fold_size = num_pages // xval_num_folds  # integer division; fold_size is used as a range bound below

    best_map = 0.0
    best_param_comb = {}

    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]

        log.d("Loading and concatenating feature matrices...")
        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gwo_param_config_string(nonlc, osp)))
        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gw_param_config_string(nvisc, vsp)))

        # build shared feature matrix
        feat_mat = np.concatenate((vis_feat_mat, gwo_traj_feat_mat), axis=1)

        stats = []
        n_visual_features = vis_feat_mat.shape[1]
        n_online_features = gwo_traj_feat_mat.shape[1]
        n_features = n_visual_features + n_online_features

        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size,
                                       test_fold * fold_size + fold_size)
            test_indices = [
                wi for pi in test_pages_indices
                for wi in range(accum_page_sizes[pi], accum_page_sizes[pi + 1])
            ]
            train_indices = [
                i for i in range(num_samples) if i not in test_indices
            ]

            log.d("Calculating topic space with {} samples...".format(
                len(train_indices)))

            svd = TruncatedSVD(n_components=256)
            svd.fit(feat_mat[train_indices])
            """
            Evaluation
            """

            # in contrast to qbs-lsa we want to query all trajectories here
            corpus_words = gw_words[test_indices]
            query_words = corpus_words

            # for test corpus only take visual features
            corpus_mat = np.zeros(shape=(len(test_indices), n_features))
            corpus_mat[:, :n_visual_features] = feat_mat[
                test_indices, :n_visual_features]

            corpus_mat = svd.transform(corpus_mat)

            # for test queries only take trajectory features
            query_mat = np.zeros(shape=(len(test_indices), n_features))
            query_mat[:, n_visual_features:] = feat_mat[test_indices,
                                                        n_visual_features:]
            query_mat = svd.transform(query_mat)

            print("")
            stats.append(
                evaluation.run_evaluation(query_mat, query_words, corpus_mat,
                                          corpus_words))
            print("")

        log.d(
            "online centroids: {}, online spatial pyramid: {}, visual words: {}, visual spatial pyramid: {}"
            .format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)

        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb

    log.d("Best parameter configuration is")
    log.d(best_param_comb)
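# Illustrative sketch (toy matrices, not the original features): the cross-modal query
# trick used above. Visual and online features are concatenated for fitting the topic
# space; at test time each modality is zero-padded on the other side before being
# projected with the same TruncatedSVD.
def _toy_topic_space_query():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    rng = np.random.RandomState(0)
    n_vis, n_onl = 6, 4
    feat_mat = rng.rand(30, n_vis + n_onl)
    svd = TruncatedSVD(n_components=3)
    svd.fit(feat_mat[:20])                          # "training" rows
    corpus_mat = np.zeros((10, n_vis + n_onl))      # visual-only test corpus
    corpus_mat[:, :n_vis] = feat_mat[20:, :n_vis]
    query_mat = np.zeros((10, n_vis + n_onl))       # online-only test queries
    query_mat[:, n_vis:] = feat_mat[20:, n_vis:]
    return svd.transform(corpus_mat), svd.transform(query_mat)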
def run_experiment():
    ma_data = "/home/chris/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4

    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{"vw": 2048, "vsp": [[2, 1], [1, 1]], "ow": 128, "osp": [[9, 2], [3, 2]]}]

    # load annotation data
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)

    # all word annotations in dataset
    gw_words = np.array([w.word for p in gw_trans_data_pages for w in p])

    log.d("Building textual descriptors...")
    phoc_feature_generator = PhocFeatureGenerator(gw_trans_data_pages)
    gw_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix()

    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(mathutils.accumulate([len(p) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    fold_size = num_pages // xval_num_folds  # integer division; fold_size is used as a range bound below

    best_map = 0.0
    best_param_comb = {}

    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]

        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(precomputed,__gwo_param_config_string(nonlc, osp)))
        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(precomputed, __gw_param_config_string(nvisc, vsp)))

        stats = []

        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size, test_fold * fold_size + fold_size)
            test_indices = [wi for pi in test_pages_indices for wi in range(accum_page_sizes[pi], accum_page_sizes[pi+1])]
            train_indices = [i for i in range(num_samples) if i not in test_indices]

            """
            Train handwriting SVMs
            """
            traj_svms = AttributesSVMGenerator()
            traj_svms.fit(gwo_traj_feat_mat[train_indices], gw_text_feat_mat[train_indices], platts=False)

            """
            Train visual SVMs
            """
            svms = AttributesSVMGenerator()
            svms.fit(vis_feat_mat[train_indices], gw_text_feat_mat[train_indices])

            # optionally learn regression which will set transform matrices in the svm objects
            # learn_regression(svms, traj_svms)

            """
            Evaluation
            """
            corpus_words = gw_words[test_indices]
            query_words = corpus_words

            corpus_mat = svms.score(vis_feat_mat[test_indices])
            query_mat = traj_svms.score(gwo_traj_feat_mat[test_indices])

            stats.append(evaluation.run_evaluation(query_mat, query_words, corpus_mat, corpus_words))
            print("")

        log.d("online centroids: {}, online spatial pyramid: {}, visual words: {}, visual spatial pyramid: {}".format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)

        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb

    log.d("Best parameter configuration is")
    log.d(best_param_comb)
def run_evaluation_with_invocab(query_mat,
                                query_words,
                                corpus_mat,
                                corpus_words,
                                train_vocab,
                                drop_first=False):
    """
    Same as run_evaluation, but also saving separate statistics for in vocabulary queries.
    """
    num_queries = len(query_mat)
    log.d("Running {} queries...".format(num_queries))
    recall = 0.0
    precision = 0.0
    avg_precision = 0.0
    iv_num_queries = 0
    iv_recall = 0.0
    iv_precision = 0.0
    iv_avg_precision = 0.0
    up = log.update_progress
    gtr = get_top_results
    er = __evaluate_results
    invalid = 0
    for i, query_vec in enumerate(query_mat):
        word = query_words[i]
        in_vocab = word in train_vocab
        top_results = gtr(corpus_mat,
                          query_vec,
                          corpus_words,
                          corpus_mat.shape[0],
                          drop_first=drop_first)
        query_occurences = sum([1 for w in corpus_words if w == word])
        if drop_first:
            query_occurences -= 1
        ap, rec, prec = er(top_results,
                           word,
                           query_occurences,
                           logresults=False)
        if ap is None:
            # don't count invalid queries
            if not in_vocab:
                invalid += 1
        else:
            if in_vocab:
                iv_recall += rec
                iv_precision += prec
                iv_avg_precision += ap
                iv_num_queries += 1
            recall += rec
            precision += prec
            avg_precision += ap
        up(i + 1, num_queries)
    print("")

    num_queries = num_queries - invalid
    if num_queries > 0:
        recall /= num_queries
        precision /= num_queries
        avg_precision /= num_queries

        __log_results("Results for {} overall queries:".format(num_queries),
                      recall, precision, avg_precision)
    else:
        log.e("No queries.")

    if iv_num_queries > 0:
        iv_recall /= iv_num_queries
        iv_precision /= iv_num_queries
        iv_avg_precision /= iv_num_queries

        __log_results(
            "Results for {} in-vocabulary queries:".format(iv_num_queries),
            iv_recall, iv_precision, iv_avg_precision)
    else:
        log.e("No in vocabulary queries.")

    return [
        num_queries, recall, precision, avg_precision, iv_num_queries,
        iv_recall, iv_precision, iv_avg_precision
    ]
def run_experiment():
    ma_data = "/home/cwieprec/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4

    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{"vw": 2048, "vsp": [[2, 1], [1, 1]], "ow": 128, "osp": [[9, 2], [3, 2]]}]

    # load annotation data
    unipen_trans_data_pages = __get_unipen_trans_data_pages(ma_data)
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)

    # find words, that are present in both gw and unipen datasets
    words = np.array([w.word for p in gw_trans_data_pages for w in p])
    unipen_words = np.array([w.word for p in unipen_trans_data_pages for w in p])
    shared_words = mathutils.remove_duplicates([w for w in unipen_words if w in words])
    log.d("{} unique words".format(len(shared_words)))

    # calculate indices for feature matrices to use only those shared words
    gw_indices = [i for i, w in enumerate(words) if w in shared_words]
    gw_words = np.array([w for w in words if w in shared_words])
    unipen_indices = [i for i, w in enumerate(unipen_words) if w in shared_words]

    log.d("Building textual descriptors...")
    phoc_feature_generator = PhocFeatureGenerator(gw_trans_data_pages)
    gw_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix()[gw_indices]
    unipen_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix(unipen_trans_data_pages)[unipen_indices]

    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(mathutils.accumulate([len([w for w in p if w.word in shared_words]) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    fold_size = num_pages // xval_num_folds  # integer division; fold_size is used as a range bound below

    best_map = 0.0
    best_param_comb = {}

    traj_svm_cache = {}

    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]

        unipen_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(precomputed,__unipen_param_config_string(nonlc, osp)))[unipen_indices]
        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(precomputed,__gwo_param_config_string(nonlc, osp)))[gw_indices]

        """
        Train/Load online-handwriting SVMs
        """
        # unipen is always used as train set as a whole
        svm_key = str(nonlc) + str(osp)
        if svm_key in traj_svm_cache:
            traj_svms = traj_svm_cache[svm_key]
        else:
            traj_svms = AttributesSVMGenerator()
            traj_svms.fit(unipen_traj_feat_mat, unipen_text_feat_mat, platts=False)
            traj_svm_cache[svm_key] = traj_svms

        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(precomputed, __gw_param_config_string(nvisc, vsp)))[gw_indices]

        stats = []

        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size, test_fold * fold_size + fold_size)
            test_indices = [wi for pi in test_pages_indices for wi in range(accum_page_sizes[pi], accum_page_sizes[pi+1])]
            train_indices = [i for i in range(num_samples) if i not in test_indices]

            """
            Train visual SVMs
            """
            svms = AttributesSVMGenerator()
            svms.fit(vis_feat_mat[train_indices], gw_text_feat_mat[train_indices])

            # optionally learn regression which will set transform matrices in the svm objects
            # learn_regression(svms, traj_svms)

            """
            Evaluation
            """
            corpus_words = gw_words[test_indices]
            query_words = corpus_words

            corpus_mat = svms.score(vis_feat_mat[test_indices])
            query_mat = traj_svms.score(gwo_traj_feat_mat[test_indices])

            stats.append(evaluation.run_evaluation(query_mat, query_words, corpus_mat, corpus_words))
            print("")

        log.d("online centroids: {}, online spatial pyramid: {}, visual words: {}, visual spatial pyramid: {}".format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)

        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb

    log.d("Best parameter configuration is")
    log.d(best_param_comb)
 def __learn_sigmoid_params(self, data_mat):
     log.d("Platt's scaling (visual feature SVMs)...")
     class_labels = svm.predict_embedded_attributes_labels(data_mat, self.attribute_svms)
     self.sigmoid_params = platt.learn_platts_scaling_params(self.train_scores, class_labels)
     del class_labels