def load_page_data_ls12(data_folder, filenames):
    """
    Loads keypoints and labels for each file in filenames.
    data_folder must contain three files per entry in 'filenames':
    <filename>.keypoints_x, <filename>.keypoints_y, <filename>.vw.
    @param data_folder: Folder containing text files for each page in filenames.
    @param filenames: List containing page filenames WITHOUT file-extension.
    @return: Keypoints-list and labels-list.
    """
    keypoints = []
    labels = []
    for f in filenames:
        x = []
        y = []
        with open("{}/{}.keypoints_x".format(data_folder, f), "r") as infile:
            x_list = infile.readlines()
            x.extend([int(float(n.strip())) for n in x_list])
        with open("{}/{}.keypoints_y".format(data_folder, f), "r") as infile:
            y_list = infile.readlines()
            y.extend([int(float(n.strip())) for n in y_list])
        keypoints.append(np.array([x, y]).T)
        with open("{}/{}.vw".format(data_folder, f), "r") as infile:
            vw_list = infile.readlines()
            # visual word labels are stored 1-based, convert to 0-based
            labels.append([int(float(n.strip())) - 1 for n in vw_list])
        log.d("Read {} keypoints and labels for page '{}'".format(len(y_list), f))
    return keypoints, labels
def __evaluate_results(top_results, query_word, query_occurences, logresults=True):
    """
    Calculate key figures from query results.
    @param top_results: List containing best results sorted ascending by
        distance to query feature vector. Contains tuples with format
        (index, wordstring, distance to query).
    @param query_word: Word that was searched.
    @param query_occurences: Number of occurrences of the query word in the
        searched corpus.
    @param logresults: If True, also print recall, precision and average precision.
    @return: Average Precision, Recall, Precision
    """
    if query_occurences == 0:
        if logresults:
            log.d("Query word '{}' is not represented in searched text.".format(query_word))
        return None, None, None
    num_results = len(top_results)
    found = 0
    it_num = 1
    top = 0
    for _, word, _ in top_results:
        if word == query_word:
            found += 1
            top += float(found) / it_num
        it_num += 1
    ap = 100 * float(top) / found if found != 0 else 0.0
    recall = 100 * float(found) / query_occurences
    precision = 100 * float(found) / num_results
    if logresults:
        __log_results("Query: '{}'".format(query_word), recall, precision, ap)
    return ap, recall, precision
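# Minimal worked example for __evaluate_results (hypothetical toy data,
# assuming get_top_results yields (index, word, distance) tuples): three
# results, two of which match a query that occurs twice in the corpus.
#
#   top_results = [(0, "orders", 0.12), (1, "orders", 0.34), (2, "october", 0.56)]
#   ap, recall, precision = __evaluate_results(top_results, "orders", 2,
#                                              logresults=False)
#   # ap        = 100 * (1/1 + 2/2) / 2 = 100.0
#   # recall    = 100 * 2 / 2           = 100.0
#   # precision = 100 * 2 / 3           = 66.67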
def build_feature_vectors_matrix(self, trans_data_pages, keypoint_data_pages,
                                 label_data_pages, step_size):
    """
    Builds feature matrix containing one row per word image and one column per feature.
    @param trans_data_pages: List of pages. Each page is a list containing
        TransData objects for each word.
    @param keypoint_data_pages: List containing the keypoints of each page.
    @param label_data_pages: List containing the labels of each page.
    @param step_size: Distance between keypoints.
    @return: Matrix containing one row per word and one column per feature.
    """
    label_matrices = self.__build_pages_matrices(keypoint_data_pages, label_data_pages)
    num_pages = len(label_matrices)
    num_rows = sum([len(t) for t in trans_data_pages])
    feat_mat = np.zeros(shape=(num_rows, self.spatial_pyramid.descriptor_size()))
    i = 0
    for page_idx in range(num_pages):
        keypoints = keypoint_data_pages[page_idx]
        label_matrix = label_matrices[page_idx]
        for word_data in trans_data_pages[page_idx]:
            origin = keypoints[0]
            x = math.ceil((word_data.xstart - origin[0]) / float(step_size))
            y = math.ceil((word_data.ystart - origin[1]) / float(step_size))
            dx = math.floor(word_data.width / float(step_size))
            dy = math.floor(word_data.height / float(step_size))
            # in terms of matrix notation, y is the row here!
            # +1 is necessary, because in Python 1:5 is 1,2,3,4!
            desc_mat = label_matrix[int(y):int(y + dy + 1), int(x):int(x + dx + 1)]
            visual_descriptor = self.spatial_pyramid.calculate_descriptor_from_mat(desc_mat)
            feat_mat[i] = visual_descriptor
            i += 1
    log.d("Visual feature-matrix has shape {}".format(feat_mat.shape))
    return feat_mat
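# Worked example of the grid arithmetic above (hypothetical numbers,
# step_size = 5): a word starting at (103, 57) on a page whose first
# keypoint is (10, 10), with width 48 and height 22:
#
#   x  = ceil((103 - 10) / 5.0) = 19
#   y  = ceil((57 - 10) / 5.0)  = 10
#   dx = floor(48 / 5.0)        = 9
#   dy = floor(22 / 5.0)        = 4
#   # -> desc_mat = label_matrix[10:15, 19:29]  (rows y..y+dy, cols x..x+dx)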
def precalculate_vis(nviscs, vsps):
    step_size = 5
    # contains .npy files for each page
    train_page_data_folder = "{}/gw_ls12/page_data".format(ROOT_FOLDER)
    # contains word image annotations for each page
    train_page_trans_folder = "{}/gw_ls12/GT".format(ROOT_FOLDER)
    filenames = ['2700270', '2710271', '2720272', '2730273', '2740274',
                 '2750275', '2760276', '2770277', '2780278', '2790279',
                 '3000300', '3010301', '3020302', '3030303', '3040304',
                 '3050305', '3060306', '3070307', '3080308', '3090309']
    trans_data_pages = trans.load_transcription_data(train_page_trans_folder, filenames)
    for nvisc in nviscs:
        log.d("Loading data for {} training pages...".format(len(filenames)))
        page_data_files = ["{}/{}/{}.npy".format(train_page_data_folder, nvisc, f)
                           for f in filenames]
        keypoint_data_pages, label_data_pages = visimport.load_page_data(page_data_files)
        for vsp in vsps:
            spatial_pyramid = SpatialPyramid(vsp, nvisc)
            vis_feat_mat = BofGenerator(spatial_pyramid).build_feature_vectors_matrix(
                trans_data_pages, keypoint_data_pages, label_data_pages, step_size)
            outname = __gw_param_config_string(nvisc, vsp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), vis_feat_mat)
def precalculate_unipen(nonlcs, osps):
    unipen_folders = ["aeb", "asl", "ben", "cb", "ckb", "dlm", "etb", "gl",
                      "ja", "jdc", "jhc", "jma", "kaj", "kew", "ksc", "lac",
                      "lcf", "mek", "mml", "mmm", "nco", "pm", "rn", "rv",
                      "sbc", "scd", "sd", "sij", "skh", "skw", "srs"]
    # we need to filter out the few word occurrences in the chosen unipen
    # subset that contain uppercase letters
    contains_uppercase = lambda t: any([c in t for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])
    unipen_traj_keypoint_paths = [
        "{}/unipen/keypointss/sta0.hpb0-{}/{}".format(ROOT_FOLDER, upfol, t)
        for upfol in unipen_folders
        for t in sorted(os.listdir("{}/unipen/keypointss/sta0.hpb0-{}".format(ROOT_FOLDER, upfol)))
        if not contains_uppercase(t)]
    for nonlc in nonlcs:
        unipen_traj_label_paths = [
            p.replace("keypointss", "labels/{}".format(nonlc)).replace("keypoints", "ow")
            for p in unipen_traj_keypoint_paths]
        unipen_traj_keypoint_data = [trajimport.read_traj_keypoints(p)
                                     for p in unipen_traj_keypoint_paths]
        unipen_traj_label_data = [trajimport.read_traj_labels(p)
                                  for p in unipen_traj_label_paths]
        for osp in osps:
            print("")
            log.d("[Loading Trainingdata]")
            traj_spatial_pyramid = SpatialPyramid(osp, nonlc)
            test_boof_generator = BoofGenerator(traj_spatial_pyramid)
            traj_feat_mat = test_boof_generator.build_feature_vectors_matrix(
                unipen_traj_keypoint_data, unipen_traj_label_data)
            outname = __unipen_param_config_string(nonlc, osp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), traj_feat_mat)
def precalculate_gwo(nonlcs, osps):
    filenames = ['2700270', '2710271', '2720272', '2730273', '2740274',
                 '2750275', '2760276', '2770277', '2780278', '2790279',
                 '3000300', '3010301', '3020302', '3030303', '3040304',
                 '3050305', '3060306', '3070307', '3080308', '3090309']
    gwo_traj_folder = "{}/gw_online".format(ROOT_FOLDER)
    gwo_traj_keypoint_paths = []
    for folder in filenames:
        path = "{}/keypointss/{}".format(gwo_traj_folder, folder)
        traj_names = sorted(f for f in os.listdir(path) if f.endswith(".keypoints"))
        gwo_traj_keypoint_paths.extend(["{}/{}".format(path, traj) for traj in traj_names])
    for nonlc in nonlcs:
        gwo_traj_label_paths = [
            p.replace("keypointss", "labels/{}".format(nonlc)).replace("keypoints", "ow")
            for p in gwo_traj_keypoint_paths]
        gwo_traj_keypoint_data = [trajimport.read_traj_keypoints(p)
                                  for p in gwo_traj_keypoint_paths]
        gwo_traj_label_data = [trajimport.read_traj_labels(p)
                               for p in gwo_traj_label_paths]
        for osp in osps:
            print("")
            log.d("[Loading Trainingdata]")
            traj_spatial_pyramid = SpatialPyramid(osp, nonlc)
            test_boof_generator = BoofGenerator(traj_spatial_pyramid)
            traj_feat_mat = test_boof_generator.build_feature_vectors_matrix(
                gwo_traj_keypoint_data, gwo_traj_label_data)
            outname = __gwo_param_config_string(nonlc, osp) + ".txt.gz"
            log.d("Saving {}".format(outname))
            np.savetxt("{}/{}".format(OUT_FOLDER, outname), traj_feat_mat)
def learn_embedded_attributes(data_mat, phoc_mat):
    """
    Learns one SVM for each PHOC-attribute.
    @param data_mat: Each row is a feature vector.
    @param phoc_mat: Each row is a phoc vector.
    @return: One SVM per PHOC attribute.
    """
    num_attributes = phoc_mat.shape[1]
    svms = np.empty(num_attributes, dtype=object)
    invalid = 0
    log.d("Training SVMs for {} attributes...".format(num_attributes))
    for att in range(num_attributes):
        log.update_progress(att + 1, num_attributes)
        labels = phoc_mat[:, att]
        # if we have only one class (all 0 or all 1) we can't train an svm
        if sum(labels) == 0 or sum(labels) == len(labels):
            svms[att] = None
            invalid += 1
            continue
        clf = svm.LinearSVC()
        clf.fit(data_mat, labels)
        svms[att] = clf
    print("")
    log.d("{} invalid attributes".format(invalid))
    return svms
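# Quick smoke test with synthetic data (hypothetical shapes; a sketch to
# illustrate the expected inputs, not part of the training pipeline):
#
#   rng = np.random.RandomState(0)
#   data_mat = rng.rand(100, 32)                      # 100 examples, 32 features
#   phoc_mat = (rng.rand(100, 16) > 0.5).astype(int)  # 16 binary attributes
#   svms = learn_embedded_attributes(data_mat, phoc_mat)
#   # svms[att] is a fitted sklearn LinearSVC, or None for degenerate attributes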
def extract_ow():
    filenames = ["{}/{}/{}".format(RAW_TRAJ_FOLDER, d, f)
                 for d in os.listdir(RAW_TRAJ_FOLDER)
                 if os.path.isdir("{}/{}".format(RAW_TRAJ_FOLDER, d))
                 for f in os.listdir("{}/{}".format(RAW_TRAJ_FOLDER, d))
                 if f.endswith(".txt")]
    log.d("Loading trajectories and calculating feature vectors...")
    feature_vectors = []
    word_sizes = []
    points_normed = []
    for i, f in enumerate(filenames):
        traj, _ = read_trajectory_from_file(f)
        traj_normed = norm.normalize_trajectory(traj, NORM_ARGS)
        write_keypoints(traj_normed, f, ROOT_FOLDER)
        feat_vec = feat.calculate_feature_vector_sequence(traj_normed, FEAT_ARGS)
        feature_vectors.extend(feat_vec)
        word_sizes.append(len(traj_normed))
        points_normed.extend([[int(n) for n in p] for p in traj_normed])
        log.update_progress(i + 1, len(filenames))
    print("")
    log.d("Accumulated {} feature vectors.".format(len(feature_vectors)))
    label_offsets = [0] + list(mathutils.accumulate(word_sizes))
    # cluster feature vectors
    log.d("Using Lloyd's algorithm to find {} clusters...".format(N_CENTROIDS))
    clusters, labels = clustering.cluster(np.array(feature_vectors), N_CENTROIDS)
    # create output dir
    try:
        os.makedirs("{}/unipen/clusters/{}".format(ROOT_FOLDER, N_CENTROIDS))
    except OSError:
        # directory already exists
        pass
    # write codebook
    with open("{}/unipen/clusters/{}/clusters_online_up_17feat.txt".format(
            ROOT_FOLDER, N_CENTROIDS), "w") as outfile:
        log.d("Writing {} clusters...".format(clusters.shape[0]))
        for c in clusters:
            outfile.write(' '.join([str(feature) for feature in c]) + '\n')
    # save labels for each trajectory
    for idx, name in enumerate(filenames):
        out_writer_dir, fname = keypoints_outfile_for_inpath(name, N_CENTROIDS, ROOT_FOLDER)
        log.d("Writing {}.ow".format(fname))
        with open("{}/{}.ow".format(out_writer_dir, fname), "w") as outfile:
            start_label_idx = label_offsets[idx]
            end_label_idx = label_offsets[idx + 1]
            print("... indices {} to {}".format(start_label_idx, end_label_idx))
            current_labels = labels[start_label_idx:end_label_idx]
            outfile.write('\n'.join([str(item) for item in current_labels]))
def save_to_file(self, path):
    log.d("Saving svms to {}".format(path))
    with open(path, "wb") as f:
        np.save(f, self.attribute_svms)
        np.save(f, self.platts)
        np.save(f, self.transform)
        if self.platts:
            np.save(f, self.sigmoid_params)
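# A counterpart loader would read the arrays back in the order they were
# written (a sketch under that assumption; load_from_file is not part of the
# original class, and allow_pickle is needed on newer numpy versions to
# restore the pickled object arrays):
#
#   def load_from_file(self, path):
#       log.d("Loading svms from {}".format(path))
#       with open(path, "rb") as f:
#           self.attribute_svms = np.load(f, allow_pickle=True)
#           self.platts = bool(np.load(f, allow_pickle=True))
#           self.transform = np.load(f, allow_pickle=True)
#           if self.platts:
#               self.sigmoid_params = np.load(f, allow_pickle=True)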
def run_evaluation(query_mat, query_words, corpus_mat, corpus_words, drop_first=False):
    """
    Using each feature vector (row) in query_mat once as query, run
    comparisons with all feature vectors (rows) in corpus_mat.
    """
    num_queries = len(query_mat)
    log.d("Running {} queries...".format(num_queries))
    recall = 0.0
    precision = 0.0
    avg_precision = 0.0
    up = log.update_progress
    gtr = get_top_results
    er = __evaluate_results
    invalid = 0
    for i, query_vec in enumerate(query_mat):
        word = query_words[i]
        top_results = gtr(corpus_mat, query_vec, corpus_words,
                          corpus_mat.shape[0], drop_first=drop_first)
        query_occurences = sum([1 for w in corpus_words if w == word])
        if drop_first:
            query_occurences -= 1
        ap, rec, prec = er(top_results, word, query_occurences, logresults=False)
        if ap is None:
            # don't count invalid queries
            invalid += 1
        else:
            recall += rec
            precision += prec
            avg_precision += ap
        up(i + 1, num_queries)
    print("")
    num_queries -= invalid
    if num_queries == 0:
        log.e("No valid queries")
        return [0, 0, 0, 0]
    recall /= num_queries
    precision /= num_queries
    avg_precision /= num_queries
    __log_results("Results for {} queries:".format(num_queries),
                  recall, precision, avg_precision)
    return [num_queries, recall, precision, avg_precision]
def build_feature_vectors_matrix(self, keypoints, labels):
    log.d("Building trajectory feature matrix...")
    num_features = self.spatial_pyramid.descriptor_size()
    num_examples = len(keypoints)
    feat_mat = np.zeros(shape=(num_examples, num_features))
    i = 0
    for keyp, lab in zip(keypoints, labels):
        feat_mat[i] = self.__build_feature_vector(keyp, lab)
        i += 1
        # i already counts the processed examples after the increment
        log.update_progress(i, num_examples)
    print("")
    log.d("Accumulated {} feature vectors.".format(len(feat_mat)))
    return feat_mat
def extract_ow():
    ma_data = "/home/chris/Work/MA.data"
    pages = ['2700270']  # , '2710271', '2720272', '2730273', '2740274',
    # '2750275', '2760276', '2770277', '2780278', '2790279',
    # '3000300', '3010301', '3020302', '3030303', '3040304',
    # '3050305', '3060306', '3070307', '3080308', '3090309']
    train_traj_folder = "{}/gw_online".format(ma_data)
    filenames = []
    for folder in pages:
        path = "{}/{}".format(train_traj_folder, folder)
        traj_names = sorted(f for f in os.listdir(path) if f.endswith(".txt"))
        filenames.extend(["{}/{}".format(path, traj) for traj in traj_names])
    # load trajectories and quantize their feature vectors against the
    # precomputed codebook
    log.d("Loading trajectories and calculating feature vectors...")
    word_sizes = []
    labels = []
    traj_clusters = trajimport.read_traj_clusters(CODEBOOK_FILE)
    for i, f in enumerate(filenames):
        traj, _ = read_trajectory_from_file(f)
        traj_normed = norm.normalize_trajectory(traj, NORM_ARGS)
        write_keypoints(traj_normed, f)
        feat_vec = feat.calculate_feature_vector_sequence(traj_normed, FEAT_ARGS)
        labels.extend(quantization.quantize_descriptors(feat_vec, traj_clusters))
        word_sizes.append(len(traj_normed))
        log.update_progress(i + 1, len(filenames))
    print("")
    label_offsets = [0] + list(mathutils.accumulate(word_sizes))
    # save labels for each trajectory
    for idx, name in enumerate(filenames):
        out_writer_dir, fname = keypoints_outfile_for_inpath(name, N_CENTROIDS)
        log.d("Writing {}.ow".format(fname))
        with open("{}/{}.ow".format(out_writer_dir, fname), "w") as outfile:
            start_label_idx = label_offsets[idx]
            end_label_idx = label_offsets[idx + 1]
            print("... indices {} to {}".format(start_label_idx, end_label_idx))
            current_labels = labels[start_label_idx:end_label_idx]
            outfile.write('\n'.join([str(item) for item in current_labels]))
def build_textual_feature_vectors_matrix(self, trans_data_pages=None):
    """
    Builds matrix containing one row per word and one column per feature.
    """
    if trans_data_pages is None:
        pages = self.trans_data_pages
    else:
        pages = trans_data_pages
    num_rows = sum([len(t) for t in pages])
    text_feat_mat = np.zeros(shape=(num_rows, self.feat_vec_size))
    i = 0
    for trans_page in pages:
        for word_data in trans_page:
            text_feat_mat[i] = self.build_textual_feature_vector(word_data.word)
            i += 1
    log.d("n-gram feature-vector-matrix has shape {}".format(text_feat_mat.shape))
    return text_feat_mat
def load_page_data(filenames):
    """
    Loads keypoints and labels for each file in filenames.
    @param filenames: List containing the full paths of the .npy page files.
    @return: Keypoints-list, labels-list.
    """
    keypoints = []
    labels = []
    for f in filenames:
        with open(f, "rb") as infile:
            # each page file holds two arrays written back-to-back
            k = np.load(infile)
            l = np.load(infile)
        keypoints.append(np.array(k))
        labels.append(np.array(l))
        log.d("Read {} keypoints and labels for page '{}'".format(len(k), f))
    return keypoints, labels
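# For reference, a page file in this format could be produced with two
# consecutive np.save calls into one handle, matching the two np.load calls
# above (a sketch under that assumption; the writer is not shown in the
# original sources):
#
#   with open("2700270.npy", "wb") as outfile:
#       np.save(outfile, page_keypoints)  # (N, 2) array of x/y coordinates
#       np.save(outfile, page_labels)     # (N,) array of visual word labels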
def build_textual_feature_vectors_matrix(self, trans_data_pages=None):
    """
    Builds matrix containing one row per word and one column per feature.
    """
    if trans_data_pages is None:
        pages = self.trans_data_pages
    else:
        pages = trans_data_pages
    num_rows = sum([len(t) for t in pages])
    text_feat_mat = np.zeros(shape=(num_rows, self.feat_vec_size))
    i = 0
    for trans_page in pages:
        for word_data in trans_page:
            text_feat_mat[i] = self.build_textual_feature_vector(word_data.word)
            i += 1
    log.d("PHOC feature-vector-matrix has shape {}".format(text_feat_mat.shape))
    return text_feat_mat
def predict_embedded_attributes_labels(data_mat, svms):
    """
    Calculate class label predictions for each feature vector (=row) in data_mat.
    @return: Matrix with each column containing class labels for one feature vector.
    """
    num_attributes = len(svms)
    num_examples = data_mat.shape[0]
    A = np.zeros(shape=(num_attributes, num_examples))
    log.d("Classifying {} examples...".format(num_examples))
    for att_idx, svm in enumerate(svms):
        log.update_progress(att_idx + 1, num_attributes)
        if svm is not None:
            if sklearn.__version__ == '0.14.1':
                A[att_idx] = svm.predict(data_mat)
            else:
                # the return format of this function was changed in 0.15...
                A[att_idx] = svm.predict(data_mat).T
    print("")
    return A
def classify_embedded_attributes(data_mat, svms):
    """
    Calculate SVM-scores for each feature vector (=row) in data_mat.
    @return: Matrix with each column representing PHOC-transformation of one feature vector.
    """
    num_attributes = len(svms)
    num_examples = data_mat.shape[0]
    A = np.zeros(shape=(num_attributes, num_examples))
    log.d("Classifying {} examples...".format(num_examples))
    for att_idx, svm in enumerate(svms):
        log.update_progress(att_idx + 1, num_attributes)
        if svm is not None:
            if sklearn.__version__ == '0.14.1':
                A[att_idx] = svm.decision_function(data_mat)
            else:
                # the return format of this function was changed in 0.15...
                A[att_idx] = svm.decision_function(data_mat).T
    print("")
    return A
def __log_results(header, mr, mp, map):
    print("")
    log.d(header)
    log.d("mR {0:.2f}%".format(mr))
    log.d("mP {0:.2f}%".format(mp))
    log.d("mAP {0:.2f}%".format(map))
def run_experiment():
    ma_data = "/home/cwieprec/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4
    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{"vw": 2048, "vsp": [[3, 2], [2, 1]],
                    "ow": 512, "osp": [[9, 2], [3, 2]]}]
    # load annotation data
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)
    # all word annotations in dataset
    gw_words = np.array([w.word for p in gw_trans_data_pages for w in p])
    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(mathutils.accumulate([len(p) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    # integer division keeps the fold boundaries usable as range() limits
    fold_size = num_pages // xval_num_folds
    best_map = 0.0
    best_param_comb = {}
    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]
        log.d("Loading and concatenating feature matrices...")
        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gwo_param_config_string(nonlc, osp)))
        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gw_param_config_string(nvisc, vsp)))
        # build shared feature matrix
        feat_mat = np.concatenate((vis_feat_mat, gwo_traj_feat_mat), axis=1)
        stats = []
        n_visual_features = vis_feat_mat.shape[1]
        n_online_features = gwo_traj_feat_mat.shape[1]
        n_features = n_visual_features + n_online_features
        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size,
                                       test_fold * fold_size + fold_size)
            test_indices = [wi for pi in test_pages_indices
                            for wi in range(accum_page_sizes[pi], accum_page_sizes[pi + 1])]
            train_indices = [i for i in range(num_samples) if i not in test_indices]
            log.d("Calculating topic space with {} samples...".format(len(train_indices)))
            svd = TruncatedSVD(n_components=256)
            svd.fit(feat_mat[train_indices])

            # evaluation: in contrast to qbs-lsa we want to query all trajectories here
            corpus_words = gw_words[test_indices]
            query_words = corpus_words
            # for the test corpus only take visual features
            corpus_mat = np.zeros(shape=(len(test_indices), n_features))
            corpus_mat[:, :n_visual_features] = feat_mat[test_indices, :n_visual_features]
            corpus_mat = svd.transform(corpus_mat)
            # for the test queries only take trajectory features
            query_mat = np.zeros(shape=(len(test_indices), n_features))
            query_mat[:, n_visual_features:] = feat_mat[test_indices, n_visual_features:]
            query_mat = svd.transform(query_mat)
            print("")
            stats.append(evaluation.run_evaluation(query_mat, query_words,
                                                   corpus_mat, corpus_words))
        print("")
        log.d("online centroids: {}, online spatial pyramid: {}, "
              "visual words: {}, visual spatial pyramid: {}".format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)
        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb
    log.d("Best parameter configuration is")
    log.d(best_param_comb)
def run_experiment():
    ma_data = "/home/chris/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4
    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{"vw": 2048, "vsp": [[2, 1], [1, 1]],
                    "ow": 128, "osp": [[9, 2], [3, 2]]}]
    # load annotation data
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)
    # all word annotations in dataset
    gw_words = np.array([w.word for p in gw_trans_data_pages for w in p])
    log.d("Building textual descriptors...")
    phoc_feature_generator = PhocFeatureGenerator(gw_trans_data_pages)
    gw_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix()
    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(mathutils.accumulate([len(p) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    fold_size = num_pages // xval_num_folds
    best_map = 0.0
    best_param_comb = {}
    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]
        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gwo_param_config_string(nonlc, osp)))
        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gw_param_config_string(nvisc, vsp)))
        stats = []
        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size,
                                       test_fold * fold_size + fold_size)
            test_indices = [wi for pi in test_pages_indices
                            for wi in range(accum_page_sizes[pi], accum_page_sizes[pi + 1])]
            train_indices = [i for i in range(num_samples) if i not in test_indices]
            # train handwriting SVMs
            traj_svms = AttributesSVMGenerator()
            traj_svms.fit(gwo_traj_feat_mat[train_indices],
                          gw_text_feat_mat[train_indices], platts=False)
            # train visual SVMs
            svms = AttributesSVMGenerator()
            svms.fit(vis_feat_mat[train_indices], gw_text_feat_mat[train_indices])
            # optionally learn regression which will set transform matrices in the svm objects
            # learn_regression(svms, traj_svms)

            # evaluation
            corpus_words = gw_words[test_indices]
            query_words = corpus_words
            corpus_mat = svms.score(vis_feat_mat[test_indices])
            query_mat = traj_svms.score(gwo_traj_feat_mat[test_indices])
            stats.append(evaluation.run_evaluation(query_mat, query_words,
                                                   corpus_mat, corpus_words))
        print("")
        log.d("online centroids: {}, online spatial pyramid: {}, "
              "visual words: {}, visual spatial pyramid: {}".format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)
        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb
    log.d("Best parameter configuration is")
    log.d(best_param_comb)
def run_evaluation_with_invocab(query_mat, query_words, corpus_mat,
                                corpus_words, train_vocab, drop_first=False):
    """
    Same as run_evaluation, but also saves separate statistics for
    in-vocabulary queries.
    """
    num_queries = len(query_mat)
    log.d("Running {} queries...".format(num_queries))
    recall = 0.0
    precision = 0.0
    avg_precision = 0.0
    iv_num_queries = 0
    iv_recall = 0.0
    iv_precision = 0.0
    iv_avg_precision = 0.0
    up = log.update_progress
    gtr = get_top_results
    er = __evaluate_results
    invalid = 0
    for i, query_vec in enumerate(query_mat):
        word = query_words[i]
        in_vocab = word in train_vocab
        top_results = gtr(corpus_mat, query_vec, corpus_words,
                          corpus_mat.shape[0], drop_first=drop_first)
        query_occurences = sum([1 for w in corpus_words if w == word])
        if drop_first:
            query_occurences -= 1
        ap, rec, prec = er(top_results, word, query_occurences, logresults=False)
        if ap is None:
            # don't count invalid queries
            invalid += 1
        else:
            if in_vocab:
                iv_recall += rec
                iv_precision += prec
                iv_avg_precision += ap
                iv_num_queries += 1
            recall += rec
            precision += prec
            avg_precision += ap
        up(i + 1, num_queries)
    print("")
    num_queries = num_queries - invalid
    if num_queries > 0:
        recall /= num_queries
        precision /= num_queries
        avg_precision /= num_queries
        __log_results("Results for {} overall queries:".format(num_queries),
                      recall, precision, avg_precision)
    else:
        log.e("No queries.")
    if iv_num_queries > 0:
        iv_recall /= iv_num_queries
        iv_precision /= iv_num_queries
        iv_avg_precision /= iv_num_queries
        __log_results("Results for {} in-vocabulary queries:".format(iv_num_queries),
                      iv_recall, iv_precision, iv_avg_precision)
    else:
        log.e("No in-vocabulary queries.")
    return [num_queries, recall, precision, avg_precision,
            iv_num_queries, iv_recall, iv_precision, iv_avg_precision]
def run_experiment():
    ma_data = "/home/cwieprec/Work/MA.data"
    precomputed = "{}/precomputed".format(ma_data)
    xval_num_folds = 4
    # list of dictionaries that contain all parameter combinations that will be run
    param_combs = [{"vw": 2048, "vsp": [[2, 1], [1, 1]],
                    "ow": 128, "osp": [[9, 2], [3, 2]]}]
    # load annotation data
    unipen_trans_data_pages = __get_unipen_trans_data_pages(ma_data)
    gw_trans_data_pages = __get_gw_trans_data_pages(ma_data)
    # find words that are present in both gw and unipen datasets
    words = np.array([w.word for p in gw_trans_data_pages for w in p])
    unipen_words = np.array([w.word for p in unipen_trans_data_pages for w in p])
    shared_words = mathutils.remove_duplicates([w for w in unipen_words if w in words])
    log.d("{} unique words".format(len(shared_words)))
    # calculate indices for feature matrices to use only those shared words
    gw_indices = [i for i, w in enumerate(words) if w in shared_words]
    gw_words = np.array([w for w in words if w in shared_words])
    unipen_indices = [i for i, w in enumerate(unipen_words) if w in shared_words]
    log.d("Building textual descriptors...")
    phoc_feature_generator = PhocFeatureGenerator(gw_trans_data_pages)
    gw_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix()[gw_indices]
    unipen_text_feat_mat = phoc_feature_generator.build_textual_feature_vectors_matrix(
        unipen_trans_data_pages)[unipen_indices]
    num_pages = len(gw_trans_data_pages)
    accum_page_sizes = [0] + list(mathutils.accumulate(
        [len([w for w in p if w.word in shared_words]) for p in gw_trans_data_pages]))
    num_samples = accum_page_sizes[-1]
    fold_size = num_pages // xval_num_folds
    best_map = 0.0
    best_param_comb = {}
    traj_svm_cache = {}
    for param_comb in param_combs:
        nonlc = param_comb["ow"]
        osp = param_comb["osp"]
        nvisc = param_comb["vw"]
        vsp = param_comb["vsp"]
        unipen_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __unipen_param_config_string(nonlc, osp)))[unipen_indices]
        gwo_traj_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gwo_param_config_string(nonlc, osp)))[gw_indices]
        # train/load online-handwriting SVMs;
        # unipen is always used as train set as a whole
        svm_key = str(nonlc) + str(osp)
        if svm_key in traj_svm_cache:
            traj_svms = traj_svm_cache[svm_key]
        else:
            traj_svms = AttributesSVMGenerator()
            traj_svms.fit(unipen_traj_feat_mat, unipen_text_feat_mat, platts=False)
            traj_svm_cache[svm_key] = traj_svms
        vis_feat_mat = np.loadtxt("{}/{}.txt.gz".format(
            precomputed, __gw_param_config_string(nvisc, vsp)))[gw_indices]
        stats = []
        for test_fold in range(xval_num_folds):
            test_pages_indices = range(test_fold * fold_size,
                                       test_fold * fold_size + fold_size)
            test_indices = [wi for pi in test_pages_indices
                            for wi in range(accum_page_sizes[pi], accum_page_sizes[pi + 1])]
            train_indices = [i for i in range(num_samples) if i not in test_indices]
            # train visual SVMs
            svms = AttributesSVMGenerator()
            svms.fit(vis_feat_mat[train_indices], gw_text_feat_mat[train_indices])
            # optionally learn regression which will set transform matrices in the svm objects
            # learn_regression(svms, traj_svms)

            # evaluation
            corpus_words = gw_words[test_indices]
            query_words = corpus_words
            corpus_mat = svms.score(vis_feat_mat[test_indices])
            query_mat = traj_svms.score(gwo_traj_feat_mat[test_indices])
            stats.append(evaluation.run_evaluation(query_mat, query_words,
                                                   corpus_mat, corpus_words))
        print("")
        log.d("online centroids: {}, online spatial pyramid: {}, "
              "visual words: {}, visual spatial pyramid: {}".format(nonlc, osp, nvisc, vsp))
        evaluation.log_xval_stats(stats)
        test_stats = evaluation.get_xval_stats(stats)
        if test_stats["mAP"] > best_map:
            best_map = test_stats["mAP"]
            best_param_comb = param_comb
    log.d("Best parameter configuration is")
    log.d(best_param_comb)
def __learn_sigmoid_params(self, data_mat):
    log.d("Platt's scaling (visual feature SVMs)...")
    class_labels = svm.predict_embedded_attributes_labels(data_mat, self.attribute_svms)
    self.sigmoid_params = platt.learn_platts_scaling_params(self.train_scores, class_labels)
    del class_labels
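# For reference, Platt's scaling fits sigmoid parameters (A, B) that map a
# raw SVM score s to a calibrated probability P(y=1 | s) = 1 / (1 + exp(A*s + B)).
# A minimal application sketch, assuming sigmoid_params holds one (A, B) pair
# per attribute SVM (this helper is not part of the original class):
#
#   def __apply_platts_scaling(self, scores, att_idx):
#       A, B = self.sigmoid_params[att_idx]
#       return 1.0 / (1.0 + np.exp(A * scores + B))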