def __init__(self, models_to_load=None, device='cpu'):
    super(FullModelInterface, self).__init__()
    # Avoid a mutable default argument; fall back to an empty list.
    self.models = models_to_load if models_to_load is not None else []
    self.dps = {
        k: DefaultProcessor(
            model_info[k][0],
            Hierarchy.from_dict(read_pickle(os.path.join(model_info[k][1], 'hierarchy.pkl')))
            if model_info[k][1] is not None else self.hierarchy,
            model_file=os.path.join(model_info[k][1], 'model_state.tpkl')
            if model_info[k][1] is not None else None,
            device=device,
            cluster=True)
        for k in self.models}
    self.trained_queries = {
        k: get_queries(os.path.join(model_info[k][1], 'used_targets.txt'))
        if model_info[k][1] is not None
        else list(self.hierarchy.descriptions.keys())
        for k in self.models}
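# Usage sketch for the constructor above (a minimal illustration, assuming a
# populated model_info dict; the key 'example_model' is hypothetical):
#
#   interface = FullModelInterface(models_to_load=['example_model'], device='cpu')
#   processor = interface.dps['example_model']
#   queries = interface.trained_queries['example_model']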
def main():
    img_files = glob.glob("oxbuild-images/*.jpg")
    # img_names = ['all_souls_000000', ...]
    img_names = utils.get_images_names(img_files)
    # query_indices = [11, 21, ...]
    query_indices = utils.get_queries(img_names)
    files_for_codebook = img_files
    # Delete from the highest index down so earlier deletions do not shift
    # the positions of the remaining query indices.
    for index in sorted(query_indices, reverse=True):
        del files_for_codebook[index]
        del img_names[index]
    img_names = utils.get_images_names(img_files)
    land_marks = ['all_souls', 'ashmolean', 'balliol', 'bodleian',
                  'christ_church', 'cornmarket', 'hertford', 'keble',
                  'magdalen', 'pitt_rivers', 'radcliffe_camera']

    # 64
    print("reading codebook...")
    codebook = np.loadtxt("clusters64.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad64.csv", delimiter=",")
    print("done")
    print("vlad matrix shape: ")
    print(vlad.shape)

    test = Test(vlad, codebook, img_names, "euclidean")
    precisions = []
    for lm in land_marks:
        for i in range(5):
            index = str(i + 1)
            precision = test.do_query(lm + "_" + index)
            precisions.append(precision)
    print("64 euclidean map = ")
    print(np.average(precisions))

    test = Test(vlad, codebook, img_names, "hellinger")
    precisions = []
    for lm in land_marks:
        for i in range(5):
            index = str(i + 1)
            precision = test.do_query(lm + "_" + index)
            precisions.append(precision)
    print("64 hellinger map = ")
    print(np.average(precisions))
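# The two evaluation passes above differ only in the distance metric. A small
# helper, sketched here (not part of the original script), removes the
# duplication:
def mean_average_precision(test, land_marks):
    # Average the per-query precision over the 5 labelled queries per landmark.
    precisions = [test.do_query("{0}_{1}".format(lm, i + 1))
                  for lm in land_marks for i in range(5)]
    return np.average(precisions)

# e.g.
#   for metric in ("euclidean", "hellinger"):
#       test = Test(vlad, codebook, img_names, metric)
#       print("64 {0} map = {1}".format(metric, mean_average_precision(test, land_marks)))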
print("Reading training data:") print("[First]:\nRead label files to relations...") relations, relation_labeler = read_lablers_to_relations( config_data["labels"]) print("[Second]:\nSet relations as train instances...") print("Reading data index ...") index = pyndri.Index(config_data["index"]) token2id, _, _ = index.get_dictionary() externalDocId = {} for doc_id in range(index.document_base(), index.maximum_document()): # type: int extD_id, _ = index.document(doc_id) externalDocId[extD_id] = doc_id train_queries = get_queries(config_data["train_queries"]) print("x_train preparation...") # the model needs list of 3 input arrays : v_q_words = [] v_d_words = [] v_rel_labels = [] # print(train_queries) print(list(relations)[0]) for relation in tqdm(relations): # get q_word_ids from index q_words = [ token2id[qi] if qi in token2id else 0 for qi in train_queries[relation[0]].strip().split()
def main():
    # img_files = ['oxbuild-images/all_souls_000000.jpg', ...]
    img_files = glob.glob("oxbuild-images/*.jpg")
    # img_names = ['all_souls_000000', ...]
    img_names = utils.get_images_names(img_files)
    # query_indices = [11, 21, ...]
    query_indices = utils.get_queries(img_names)
    files_for_codebook = img_files
    # Delete from the highest index down so earlier deletions do not shift
    # the positions of the remaining query indices.
    for index in sorted(query_indices, reverse=True):
        del files_for_codebook[index]
        del img_names[index]

    # Training
    # --------------------------------------------------------------------------
    # Extracting descriptors
    #
    # start = time()
    # descriptors_count = train.calculate_descriptors(files_for_codebook)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time getting the descriptors {0}.".format(elapsed_time))
    #
    # # Get the sample of 100k descriptors
    # start = time()
    # sample = train.get_sample()
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time getting the sample {0}.".format(elapsed_time))
    #
    # # Clustering
    #
    # k = 64
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters64.csv", clusters, delimiter=",")
    #
    # k = 128
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters128.csv", clusters, delimiter=",")
    #
    # k = 256
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters256.csv", clusters, delimiter=",")

    # Vlad
    # k = 64
    # clusters = np.loadtxt("clusters64.csv", delimiter=",")
    # vlad = Vlad(clusters, 64)
    # vlad_matrix = None
    # i = 0
    # start = time()
    # for image_path in files_for_codebook:
    #     print(str(i) + "/" + str(len(files_for_codebook)))
    #     descriptors = train.get_descriptor_from_image_path(image_path)
    #
    #     vlad_imagen = vlad.get_image_vlad(descriptors)
    #     if vlad_matrix is None:
    #         vlad_matrix = vlad_imagen
    #     else:
    #         vlad_matrix = np.vstack((vlad_matrix, vlad_imagen))
    #     i += 1
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time vlad {0}.".format(elapsed_time))
    # np.savetxt("vlad64.csv", vlad_matrix, delimiter=",")
    #
    # # k = 256 (second argument fixed to match the cluster count)
    # clusters = np.loadtxt("clusters256.csv", delimiter=",")
    # vlad = Vlad(clusters, 256)
    # vlad_matrix = None
    # i = 0
    # start = time()
    # for image_path in files_for_codebook:
    #     print(str(i) + "/" + str(len(files_for_codebook)))
    #     descriptors = train.get_descriptor_from_image_path(image_path)
    #
    #     vlad_imagen = vlad.get_image_vlad(descriptors)
    #     if vlad_matrix is None:
    #         vlad_matrix = vlad_imagen
    #     else:
    #         vlad_matrix = np.vstack((vlad_matrix, vlad_imagen))
    #     i += 1
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time vlad {0}.".format(elapsed_time))
    # np.savetxt("vlad256.csv", vlad_matrix, delimiter=",")

    # Testing
    # --------------------------------------------------------------------------
    # Run queries

    # 64
    query_name = "hertford_4"
    print("reading 64-cluster codebook...")
    codebook = np.loadtxt("clusters64.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad64.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    print("ranking: ")
    print(ranking)
    file_name = "ranking_" + query_name + "_64"
    list_to_file(ranking.tolist(), file_name)

    # 128
    print("reading 128-cluster codebook...")
    codebook = np.loadtxt("clusters128.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad128.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    file_name = "ranking_" + query_name + "_128"
    list_to_file(ranking.tolist(), file_name)

    # 256
    print("reading 256-cluster codebook...")
    codebook = np.loadtxt("clusters256.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad256.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    file_name = "ranking_" + query_name + "_256"
    list_to_file(ranking.tolist(), file_name)
from sklearn import base
from google.cloud import bigquery

import utils as u

queries = u.get_queries('queries')
bq_client = bigquery.Client()


class PostCodeExtractor(base.TransformerMixin, base.BaseEstimator):
    '''Extract the post code from the address column and drop the address column.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_c = X.copy()
        X_c['post_code'] = X_c['address'].str.extract(r'.*(\d{4}).*', expand=False)
        # Drop the address column, as the docstring promises.
        return X_c.drop(columns=['address'])


class PostCodeEnricher(base.TransformerMixin, base.BaseEstimator):
    '''Extract the post code data from BigQuery and merge it with the training data.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
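# Sketch of composing the transformers into a scikit-learn pipeline. The final
# estimator below is a placeholder assumption, and the sketch presumes
# PostCodeEnricher's transform method (cut off above) is implemented:
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('extract_post_code', PostCodeExtractor()),
    ('enrich_post_code', PostCodeEnricher()),
    ('model', DummyRegressor()),  # stand-in for the real estimator
])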
import glob

import utils

img_files = glob.glob("oxbuild-images/*.jpg")
img_names = utils.get_images_names(img_files)
query_indices = utils.get_queries(img_names)
files_for_codebook = img_files
# Delete from the highest index down so earlier deletions do not shift
# the positions of the remaining query indices.
for index in sorted(query_indices, reverse=True):
    del files_for_codebook[index]
print(len(files_for_codebook))
import os

from pytt.utils import read_pickle

from utils import get_queries

dataset = '/home/jered/Documents/data/mimic-iii-clinical-database-1.4/preprocessed/reports_and_codes_expanded'
# need to add support for ancestors
# code_graph_file =
# ancestors = True
rebalanced = True
counts_file = os.path.join(dataset, 'counts.pkl')
used_targets_file = os.path.join(dataset, 'used_targets.txt')
used_targets = get_queries(used_targets_file)
counts = read_pickle(counts_file)
micro_counts = [[], [], []]
macro_scores = [[], [], []]
for k, v in counts.items():
    if k not in used_targets:
        continue
    total = v[0] + v[1]
    true_positives = v[1] / 2 if rebalanced else v[1] * v[1] / total
    micro_counts[0] += [true_positives]
    positives = total / 2 if rebalanced else v[1]
    micro_counts[1] += [positives]
    relevants = v[1]
    micro_counts[2] += [relevants]
    if positives != 0:
        # per-code precision
        p = true_positives / positives
        macro_scores[0] += [p]
    if relevants != 0:
        # per-code recall
        r = true_positives / relevants
        macro_scores[1] += [r]
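# Sketch of the final aggregation implied by the accumulators above: micro
# scores pool the raw counts across codes, macro scores average the per-code
# values (the variable names below are illustrative, not from the source):
import numpy as np

micro_precision = np.sum(micro_counts[0]) / np.sum(micro_counts[1])
micro_recall = np.sum(micro_counts[0]) / np.sum(micro_counts[2])
macro_precision = np.mean(macro_scores[0])
macro_recall = np.mean(macro_scores[1])
print("micro P/R: {0:.4f}/{1:.4f}".format(micro_precision, micro_recall))
print("macro P/R: {0:.4f}/{1:.4f}".format(macro_precision, macro_recall))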