def __init__(self, models_to_load=None, device='cpu'):
    super(FullModelInterface, self).__init__()
    # Avoid a mutable default argument: a shared default list would leak
    # state across instances.
    self.models = models_to_load if models_to_load is not None else []
    self.dps = {
        k: DefaultProcessor(
            model_info[k][0],
            Hierarchy.from_dict(read_pickle(os.path.join(model_info[k][1], 'hierarchy.pkl')))
                if model_info[k][1] is not None else self.hierarchy,
            model_file=os.path.join(model_info[k][1], 'model_state.tpkl')
                if model_info[k][1] is not None else None,
            device=device,
            cluster=True)
        for k in self.models}
    self.trained_queries = {
        k: get_queries(os.path.join(model_info[k][1], 'used_targets.txt'))
            if model_info[k][1] is not None else list(self.hierarchy.descriptions.keys())
        for k in self.models}
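
Both this interface and the counts script in Example #9 load targets with get_queries(<path>/used_targets.txt). That helper's implementation is not shown anywhere in these snippets; a minimal sketch, assuming it simply reads one stripped entry per line (note that the utils.get_queries in the Oxford Buildings snippets below is a different helper that returns query indices):

def get_queries(path):
    # Hypothetical sketch: one query/target per non-empty line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]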
def main():
    img_files = glob.glob("oxbuild-images/*.jpg")
    # img_names = ['all_souls_000000', ...]
    # query_indices = [11, 21, ...]
    img_names = utils.get_images_names(img_files)
    query_indices = utils.get_queries(img_names)
    # img_files is aliased here, not copied, so the deletions below also
    # mutate img_files; delete in descending index order so earlier
    # deletions do not shift the remaining indices.
    files_for_codebook = img_files
    for index in sorted(query_indices, reverse=True):
        del files_for_codebook[index]
        del img_names[index]

    land_marks = [
        'all_souls', 'ashmolean', 'balliol', 'bodleian', 'christ_church',
        'cornmarket', 'hertford', 'keble', 'magdalen', 'pitt_rivers',
        'radcliffe_camera'
    ]

    # k = 64
    print("reading codebook...")
    codebook = np.loadtxt("clusters64.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad64.csv", delimiter=",")
    print("done")
    print("vlad matrix shape:", vlad.shape)
    test = Test(vlad, codebook, img_names, "euclidean")
    precisions = []
    for lm in land_marks:
        for i in range(5):
            index = str(i + 1)
            precision = test.do_query(lm + "_" + index)
            precisions.append(precision)
    print("64 euclidean map = "),
    print(np.average(precisions))

    test = Test(vlad, codebook, img_names, "hellinger")
    precisions = []
    for lm in land_marks:
        for i in range(5):
            index = str(i + 1)
            precision = test.do_query(lm + "_" + index)
            precisions.append(precision)
    print("64 hellinger map = "),
    print(np.average(precisions))
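
# The two evaluation blocks above repeat the same 55-query loop
# (11 landmarks x 5 numbered queries each). A refactoring sketch, assuming
# Test.do_query returns the average precision for a single query name:
def mean_average_precision(test, land_marks, queries_per_landmark=5):
    # Average the per-query precisions over every landmark query.
    precisions = [test.do_query("{0}_{1}".format(lm, i + 1))
                  for lm in land_marks
                  for i in range(queries_per_landmark)]
    return np.average(precisions)

# e.g.:
# print("64 euclidean map =",
#       mean_average_precision(Test(vlad, codebook, img_names, "euclidean"),
#                              land_marks))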
    print("Reading training data:")
    print("[First]:\nRead label files to relations...")
    relations, relation_labeler = read_lablers_to_relations(
        config_data["labels"])

    print("[Second]:\nSet relations as train instances...")

    print("Reading data index ...")
    index = pyndri.Index(config_data["index"])
    token2id, _, _ = index.get_dictionary()
    externalDocId = {}
    for doc_id in range(index.document_base(),
                        index.maximum_document()):  # type: int
        extD_id, _ = index.document(doc_id)
        externalDocId[extD_id] = doc_id
    train_queries = get_queries(config_data["train_queries"])

    print("x_train preparation...")
    # the model needs a list of 3 input arrays:
    v_q_words = []
    v_d_words = []
    v_rel_labels = []

    # print(train_queries)
    print(list(relations)[0])

    for relation in tqdm(relations):
        # get q_word_ids from index
        q_words = [
            token2id[qi] if qi in token2id else 0
            for qi in train_queries[relation[0]].strip().split()
        ]
Example #5
def main():
    # img_files = ['oxbuild-images/all_souls_000000.jpg', ...]
    img_files = glob.glob("oxbuild-images/*.jpg")
    # img_names = ['all_souls_000000', ...]
    img_names = utils.get_images_names(img_files)
    # query_indices = [11, 21, ...]
    query_indices = utils.get_queries(img_names)
    files_for_codebook = img_files
    # Delete in descending index order so earlier deletions do not shift
    # the remaining indices.
    for index in sorted(query_indices, reverse=True):
        del files_for_codebook[index]
        del img_names[index]

    # Training
    #---------------------------------------------------------------------------
    # Extracting descriptors
    #
    # start = time()
    # descriptors_count = train.calculate_descriptors(files_for_codebook)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time getting the descriptors {0}.".format(elapsed_time))
    #
    # #Get the sample of 100k descriptors
    # start = time()
    # sample = train.get_sample()
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time getting the sample {0}.".format(elapsed_time))
    #
    # # Clustering
    #
    # k = 64
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters64.csv", clusters, delimiter=",")
    #
    # k = 128
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters128.csv", clusters, delimiter=",")
    #
    # k = 256
    # start = time()
    # clusters = train.get_clusters(k, sample)
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time clustering for k={0} {1}".format(k, elapsed_time))
    # np.savetxt("clusters256.csv", clusters, delimiter=",")

    # Vlad
    #k = 64
    # clusters = np.loadtxt("clusters64.csv", delimiter=",")
    # vlad = Vlad(clusters, 64)
    # vlad_matrix = None
    # i = 0
    # start = time()
    # for image_path in files_for_codebook:
    #     print(str(i) +  "/" + str(len(files_for_codebook)))
    #     descriptors = train.get_descriptor_from_image_path(image_path)
    #
    #     vlad_imagen = vlad.get_image_vlad(descriptors)
    #     if vlad_matrix is None:
    #         vlad_matrix = vlad_imagen
    #     else:
    #         vlad_matrix = np.vstack((vlad_matrix,vlad_imagen))
    #     i += 1
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time vlad {0}.".format(elapsed_time))
    # np.savetxt("vlad64.csv",vlad_matrix,delimiter=",")
    #
    # #k = 256
    # clusters = np.loadtxt("clusters256.csv", delimiter=",")
    # vlad = Vlad(clusters, 256)
    # vlad_matrix = None
    # i = 0
    # start = time()
    # for image_path in files_for_codebook:
    #     print(str(i) +  "/" + str(len(files_for_codebook)))
    #     descriptors = train.get_descriptor_from_image_path(image_path)
    #
    #     vlad_imagen = vlad.get_image_vlad(descriptors)
    #     if vlad_matrix is None:
    #         vlad_matrix = vlad_imagen
    #     else:
    #         vlad_matrix = np.vstack((vlad_matrix,vlad_imagen))
    #     i += 1
    # end = time()
    # elapsed_time = utils.humanize_time(end - start)
    # print("Elapsed time vlad {0}.".format(elapsed_time))
    # np.savetxt("vlad256.csv",vlad_matrix,delimiter=",")

    # Testing
    #---------------------------------------------------------------------------

    # Run the queries
    # k = 64
    query_name = "hertford_4"
    print("reading 64-cluster codebook...")
    codebook = np.loadtxt("clusters64.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad64.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    print("ranking: ")
    print(ranking)
    file_name = "ranking_" + query_name + "_64"
    list_to_file(ranking.tolist(), file_name)
    # k = 128
    print("reading 128-cluster codebook...")
    codebook = np.loadtxt("clusters128.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad128.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    file_name = "ranking_" + query_name + "_128"
    list_to_file(ranking.tolist(), file_name)
    # k = 256
    print("reading 256-cluster codebook...")
    codebook = np.loadtxt("clusters256.csv", delimiter=",")
    print("reading vlad matrix...")
    vlad = np.loadtxt("vlad256.csv", delimiter=",")
    print("done")
    test = Test(vlad, codebook, img_names, "euclidean")
    ranking = test.do_query_and_get_ranking(query_name)
    file_name = "ranking_" + query_name + "_256"
    list_to_file(ranking.tolist(), file_name)
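
list_to_file is assumed to persist a ranking, one entry per line; its implementation is not shown in these snippets. A minimal sketch under that assumption:

def list_to_file(items, file_name):
    # Write one ranked image name per line.
    with open(file_name, "w") as f:
        for item in items:
            f.write("{0}\n".format(item))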
Example #6
from sklearn import base
from google.cloud import bigquery
import utils as u

queries = u.get_queries('queries')
bq_client = bigquery.Client()


class PostCodeExtractor(base.TransformerMixin, base.BaseEstimator):
    '''Extract the post code from the address column
    and drop the address column'''
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_c = X.copy()
        X_c['post_code'] = X_c['address'].str.extract(r'.*(\d{4}).*',
                                                      expand=False)
        return X_c
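
# A quick usage sketch for PostCodeExtractor (the addresses are made up;
# assumes pandas and four-digit post codes, matching the regex above):
import pandas as pd

demo = pd.DataFrame({'address': ['1 George St, Sydney NSW 2000',
                                 '10 King William St, Adelaide SA 5000']})
print(PostCodeExtractor().fit_transform(demo)['post_code'])
# 0    2000
# 1    5000
# Name: post_code, dtype: object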


class PostCodeEnricher(base.TransformerMixin, base.BaseEstimator):
    '''Extract the post code data from bigquery,
    merge it with the training data.'''
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Truncated in the source; a scikit-learn fit must return self.
        return self
Example #7
import glob
import utils

img_files = glob.glob("oxbuild-images/*.jpg")
img_names = utils.get_images_names(img_files)
query_indices = utils.get_queries(img_names)
files_for_codebook = img_files
# Delete in descending index order so earlier deletions do not shift indices.
for index in sorted(query_indices, reverse=True):
    del files_for_codebook[index]
print(len(files_for_codebook))
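
utils.get_images_names is used throughout these snippets but never defined; a plausible sketch, assuming the names are just the file stems ('oxbuild-images/all_souls_000000.jpg' -> 'all_souls_000000'):

import os

def get_images_names(img_files):
    # Strip the directory and extension from each image path.
    return [os.path.splitext(os.path.basename(f))[0] for f in img_files]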
Example #9
import os
from pytt.utils import read_pickle
from utils import get_queries

dataset = '/home/jered/Documents/data/mimic-iii-clinical-database-1.4/preprocessed/reports_and_codes_expanded'
# need to add support for ancestors
# code_graph_file =
# ancestors = True
rebalanced = True
counts_file = os.path.join(dataset, 'counts.pkl')
used_targets_file = os.path.join(dataset, 'used_targets.txt')

used_targets = get_queries(used_targets_file)
counts = read_pickle(counts_file)

micro_counts = [[], [], []]
macro_scores = [[], [], []]

for k, v in counts.items():
    if k not in used_targets: continue
    total = v[0] + v[1]
    true_positives = v[1] / 2 if rebalanced else v[1] * v[1] / total
    micro_counts[0] += [true_positives]
    positives = total / 2 if rebalanced else v[1]
    micro_counts[1] += [positives]
    relevants = v[1]
    micro_counts[2] += [relevants]
    if positives != 0:
        p = true_positives / positives
        macro_scores[0] += [p]
    if relevants != 0: