class DataPreProcessor:
    def __init__(self):
        """
        1) Read each image from data dir and put metadata in database table.
        2) Pass that image to each feature model to get a vector of dimension (1 * m)
        3) Pass this vector to all the dimension reduction technique to get latent semantics.
        4) Put this latent semantic of each image from each model and technique with its metadata inside database
        """
        self.root_path = str(Path(os.getcwd()).parent / "Data")
        self.database_connection = DatabaseConnection()

        self.process_metadata()
        self.process_classification_metadata()

        self.DATABASE_IMAGES_PATH = self.root_path + "/images"
        self.CLASSIFICATION_IMAGES_PATH = self.root_path + "/phase3_sample_data"
        # feature_models = []
        # feature_models.append("histogram_of_gradients")
        # feature_models.append("sift")
        # # feature_models.append("histogram_of_gradients_30")
        #
        # processes = []
        # for i, feature in enumerate(feature_models):
        #     processes.append(Process(target=self.perform_feature_model(feature)))
        #     processes[i].start()
        #
        # for i in range(len(processes)):
        #     processes[i].join()

        # Classification-specific preprocessing

        charas = ["Labelled", "Unlabelled"]
        # sets = ["Set1", "Set2", "Set3", "Set4"]
        sets = ["Set1", "Set2", "Set3"]
        number_of_clusters = ['250', '300']
        paths = []
        feature_models = []
        cluster_counts = []
        for chara_ in charas:
            for set_ in sets:
                for cluster_count in number_of_clusters:
                    paths.append(self.CLASSIFICATION_IMAGES_PATH + "/" + chara_ + "/" + set_)
                    # feature_models.append("histogram_of_gradients" + "_" + chara_ + "_" + set_)
                    # feature_models.append("local_binary_pattern" + "_" + chara_ + "_" + set_)
                    feature_models.append("sift" + "_" + chara_ + "_" + set_ + "_" + cluster_count)
                    cluster_counts.append(cluster_count)

        processes = []
        for i, feature in enumerate(feature_models):
            # Pass the callable and its arguments separately so each Process runs the work
            # in a child process instead of calling the method in the parent.
            process = Process(target=self.perform_classification_feature_model,
                              args=(feature, paths[i], cluster_counts[i]))
            processes.append(process)
            process.start()
        for process in processes:
            process.join()

    # Read the metadata for every input image and store it in the database.
    def process_metadata(self):
        csv_file_path = self.root_path + "/HandInfo.csv"
        connection = self.database_connection.get_db_connection()
        cursor = connection.cursor()
        cursor.execute("""DROP Table IF EXISTS metadata;""")
        # Create metadata table
        cursor.execute("CREATE TABLE IF NOT EXISTS metadata( \n"
                       "                        id INT NOT NULL, \n"
                       "                        age INT NOT NULL, \n"
                       "                        gender TEXT NOT NULL, \n"
                       "                        skinColor TEXT NOT NULL, accessories INT NOT NULL,\n"
                       "                        nailPolish INT NOT NULL,\n"
                       "                        aspectOfHand TEXT NOT NULL,\n"
                       "                        imageName TEXT NOT NULL,\n"
                       "                        irregularities INT NOT NULL,\n"
                       "                        PRIMARY KEY (imageName)\n"
                       "                        );\n"
                       "                        ")
        # Open the file in Python (rather than server-side COPY) to avoid a permission error on Linux
        with open(csv_file_path, 'r') as f:
            next(f)
            cursor.copy_from(f, 'metadata', sep=',', null='')
        # cursor.execute("""copy metadata from '{}' csv header;""".format(csv_file_path))
        connection.commit()

    def process_classification_metadata(self):

        # metadata_files = ['labelled_set1.csv', 'labelled_set2.csv', 'unlabelled_set1.csv', 'unlabelled_set2.csv']
        data_folder = self.root_path + "/phase3_sample_data"
        extension = 'csv'
        # Collect the metadata CSV files without changing the process working directory
        metadata_files = [os.path.basename(x)
                          for x in glob.glob(os.path.join(data_folder, '*.{}'.format(extension)))]

        connection = self.database_connection.get_db_connection()

        cursor = connection.cursor()

        for metadata_file in metadata_files:
            table_name = "metadata_" + metadata_file.split('.')[0]
            metadata_file_path = data_folder + "/" + metadata_file

            cursor.execute("DROP Table IF EXISTS " + table_name + ";")

            cursor.execute("""CREATE TABLE IF NOT EXISTS """ + table_name + """(
                            some_number INT,
                            id INT NOT NULL,
                            age INT NOT NULL,
                            gender TEXT NOT NULL,
                            skinColor TEXT NOT NULL,
                            accessories INT NOT NULL,
                            nailPolish INT NOT NULL,
                            aspectOfHand TEXT,
                            imageName TEXT NOT NULL,
                            irregularities INT NOT NULL,
                            PRIMARY KEY (imageName)
                            );
                            """)
            # Open the file in Python (rather than server-side COPY) to avoid a permission error on Linux
            with open(metadata_file_path, 'r') as f:
                next(f)
                cursor.copy_from(f, table_name, sep=',', null='')
            # cursor.execute("""copy """ + table_name + """ from '{}' csv header;""".format(metadata_file_path))
            connection.commit()

    def perform_feature_model(self, feature):
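        """
        Compute feature vectors for every database image with the requested model
        (SIFT visual-word histograms or HOG) and store them in that model's feature table.
        """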
        if feature == 'sift':
            sift = SIFT(self.DATABASE_IMAGES_PATH)
            sift.read_and_clusterize(num_cluster=150)
            feature_vectors = sift.calculate_centroids_histogram()
        else:
            histogram_of_gradients = HistogramOfGradients(self.DATABASE_IMAGES_PATH)
            feature_vectors = histogram_of_gradients.get_image_vectors()

        self.database_connection.create_feature_model_table(feature)
        self.database_connection.insert_feature_data(feature, feature_vectors)

    def perform_classification_feature_model(self, feature, path, cluster_count):
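        """
        Compute feature vectors for the images under `path` with the model named in `feature`
        (HOG, LBP, or SIFT with `cluster_count` clusters) and store them in the database.
        """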
        self.database_connection.create_feature_model_table(feature)
        if "histogram_of_gradients" in feature:
            histogram_of_gradients = HistogramOfGradients(path)
            feature_vectors = histogram_of_gradients.get_image_vectors()
        elif "local_binary_pattern" in feature:
            local_binary_pattern = LocalBinaryPattern(path)
            feature_vectors = local_binary_pattern.get_image_vectors()
        elif "sift" in feature:
            sift = SIFT(path)
            sift.read_and_clusterize(num_cluster=int(cluster_count))
            feature_vectors = sift.calculate_centroids_histogram()
        self.database_connection.insert_feature_data(feature, feature_vectors)
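
# A minimal usage sketch (an assumption about how this class is driven; it simply runs the
# whole preprocessing pipeline from __init__ against the Data/ directory and the PostgreSQL
# database behind DatabaseConnection):
#
#   if __name__ == "__main__":
#       DataPreProcessor()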


class RelevanceFeedback:
    def __init__(self):
        self.database_connection = DatabaseConnection()
        self.conn = self.database_connection.get_db_connection()
        print('Initiating RelevanceFeedback....')

    def compute_new_query_vector(self, q_old, relevant_items, irrel_items, alpha=0.3, beta=0.65, gamma=0.05):
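        """
        Rocchio-style query update:
            q_new = alpha * q_old + beta * mean(relevant vectors) - gamma * mean(irrelevant vectors)
        Feature vectors for the marked images are fetched as HOG descriptors from the database.
        """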
        print('Computing new query vector.....')

        avg_rel_vec = np.zeros(q_old.shape)
        avg_irl_vec = np.zeros(q_old.shape)

        # Aggregating relevant items
        for item in relevant_items:
            vector = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            avg_rel_vec = avg_rel_vec + vector

        # Aggregating irrelevant items
        for item in irrel_items:
            vector = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            avg_irl_vec = avg_irl_vec + vector

        if len(relevant_items) != 0:
            avg_rel_vec = avg_rel_vec / len(relevant_items)

        if len(irrel_items) != 0:
            avg_irl_vec = avg_irl_vec / len(irrel_items)

        q_new = alpha * q_old + beta * avg_rel_vec - gamma * avg_irl_vec
        return q_new

    def get_user_feedback(self, init_rank_list, q_name, caller='misc'):
        print('Taking user feedback now...')
        rel_items = []
        irl_items = []

        # The probabilistic caller ('prb') passes the rank list wrapped in an outer list
        items = init_rank_list[0] if caller == 'prb' else init_rank_list
        for item in items:
            if item[0] == q_name:
                continue
            print(f'Is image {item[0]} relevant ? (y/n)')
            # Compare the user's input by equality, not identity
            if input() == 'y':
                rel_items.append(item[0])
            else:
                irl_items.append(item[0])

        return rel_items, irl_items

    def get_SVM_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
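        """
        SVM-based relevance feedback: train a binary SVM on the user-marked relevant (+1)
        and irrelevant (-1) images, classify the LSH candidate set, and re-rank the images
        predicted relevant against the Rocchio-updated query vector.
        """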
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items, irl_items=irl_items)

        # Training SVM classifier
        svm = support_vector_machine.SupportVectorMachine()
        svm.fit(X=X_train, y=Y_train)

        # Now getting more test data from LSH indexes
        test_dataset = read_from_pickle('test_dataset.pickle')
        X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
        Y_pred = svm.predict(u=X_test)
        relevant_pred_img_names = [imageNames[i] for i in range(0, len(Y_pred)) if Y_pred[i] == 1]
        length_relevant_images = len(relevant_pred_img_names)
        if length_relevant_images < m:
            # Pad the result set with images predicted irrelevant so that m candidates remain
            irr_image_names = [imageNames[i] for i in range(len(Y_pred)) if Y_pred[i] == -1]
            relevant_pred_img_names.extend(irr_image_names[:m - length_relevant_images])
        new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
            image_ids=relevant_pred_img_names)

        new_rank_list = get_most_m_similar_images(data_with_images=new_obj_feature_matrix,
                                                  query_image_feature_vector=q_new, m=m)
        return new_rank_list

    def get_DTC_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
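        """
        Decision-tree-based relevance feedback: same pipeline as the SVM variant, with a
        decision tree as the relevant/irrelevant classifier.
        """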
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items, irl_items=irl_items)

        # Training decision-tree classifier
        dtl = decision_tree_learning.DecisionTreeLearning()
        dtl.fit(X=X_train, y=Y_train)

        # Now getting more test data from LSH indexes
        test_dataset = read_from_pickle('test_dataset.pickle')
        X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
        Y_pred = dtl.predict(u=X_test)
        relevant_pred_img_names = [imageNames[i] for i in range(0, len(Y_pred)) if Y_pred[i] == 1]

        length_relevant_images = len(relevant_pred_img_names)
        if length_relevant_images < m:
            # Pad the result set with images predicted irrelevant so that m candidates remain
            irr_image_names = [imageNames[i] for i in range(len(Y_pred)) if Y_pred[i] == -1]
            relevant_pred_img_names.extend(irr_image_names[:m - length_relevant_images])

        new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
            image_ids=relevant_pred_img_names)

        new_rank_list = get_most_m_similar_images(data_with_images=new_obj_feature_matrix,
                                                  query_image_feature_vector=q_new, m=m)
        return new_rank_list

    def get_PPR_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
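        """
        PPR-based relevance feedback: build an image-similarity graph over the LSH candidates
        (on SVD-reduced HOG features), seed Personalised PageRank with the user-marked
        relevant/irrelevant images, and rank images by their PageRank scores.
        """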
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        topology_images = read_from_pickle('test_dataset.pickle')
        image_names = get_image_names_from_tuples(topology_images)
        db_conn = DatabaseConnection()
        data_image_dict = db_conn.HOG_descriptor_from_image_ids(image_names)
        data_matrix = data_image_dict['data_matrix']
        image_names = data_image_dict['images']
        svd_obj = SingularValueDecomposition()
        svd_image_data = svd_obj.get_transformed_data(data_matrix, 8)  # change this for 11K images

        pg_obj = PageRank()
        image_similarity_matrix = pg_obj.get_image_similarity_matrix_for_top_k_images(6, svd_image_data)
        seed_vector = pg_obj.get_seed_vector(rel_items, image_names, irl_items)
        pie = pg_obj.get_page_rank_eigen_vector(image_similarity_matrix, seed_vector)
        new_rank_list = pg_obj.get_top_K_images_based_on_scores(pie, image_names, m)

        return new_rank_list

    # Used by the SVM, DTC and PPR feedback paths; see calculate_initial_prob_similarity
    # for the probabilistic variant.
    def get_init_ranking(self, obj_feature_matrix, q):
        svd = singular_value_decomposition.SingularValueDecomposition()
        data_matrix = obj_feature_matrix['data_matrix']
        U, S, Vt = svd.get_latent_semantics(data_matrix=data_matrix, n_components=25)
        init_rank_list = get_most_m_similar_images(data_with_images=obj_feature_matrix, query_image_feature_vector=q,
                                                   Vt=Vt, m=5)
        return init_rank_list, Vt

    # rel_items,irl_items=rf.get_user_feedback(init_rank_list=init_rank_list,q_name=q_name)
    # q_new=rf.compute_new_query_vector(q_old=q,relevant_items=rel_items,irrel_items=irl_items)
    # new_rank_list=get_most_m_similar_images(data_with_images=obj_feature_matrix,query_image_feature_vector=q_new,Vt=Vt,m=5)

    # Used by the SVM, DTC and PPR feedback paths; see calculate_initial_prob_similarity
    # for the probabilistic variant.
    def get_Vt(self, obj_feature_matrix):
        svd = singular_value_decomposition.SingularValueDecomposition()
        data_matrix = obj_feature_matrix['data_matrix']
        U, S, Vt = svd.get_latent_semantics(data_matrix=data_matrix, n_components=25)
        return Vt

    def get_probabilistic_relevance_feedback(self, D_matrix, images, q_name, m):
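        """
        Probabilistic relevance feedback over a binarised feature matrix: the initial ranking
        uses idf-style weights log2((N - n_i + 0.5) / (n_i + 0.5)); after feedback, features are
        re-weighted using the relevant-image counts r_i (a Robertson/Sparck Jones style estimate).
        A feature counts as "present" in an image when its value is > 0.
        """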
        n_i = self.calculate_n_i(D_matrix=D_matrix)
        init_scores = self.calculate_initial_prob_similarity(D_matrix=D_matrix, images=images, n_i=n_i)
        rel_items, irl_items = self.get_user_feedback(init_rank_list=[init_scores[:m]], q_name=q_name, caller='prb')
        new_rank_list = self.calculate_feedback_prob_similarity(D_matrix=D_matrix, images=images,
                                                                relevant_items=rel_items, n_i=n_i)
        return new_rank_list[:m]

    def calculate_feedback_prob_similarity(self, D_matrix, images, relevant_items, n_i):
        N = D_matrix.shape[0]
        R = len(relevant_items)
        n_i = n_i[0]
        r_i = self.calculate_r_i(D_matrix=D_matrix, images=images, relevant_items=relevant_items)
        r_i = r_i[0]
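        # N: total number of images, R: number of relevant images,
        # n_i[i]: images in which feature i is present, r_i[i]: relevant images in which feature i is present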

        feedback_scores = {}
        j = 0
        for d in D_matrix:
            sim_score = 0
            for i in range(0, len(n_i)):
                numerator = (r_i[i] + 0.5) / (R + 1 - r_i[i])
                denominator = (n_i[i] - r_i[i] + 0.5) / (N - R + 1 - n_i[i] + r_i[i])
                sim_score = sim_score + d[i] * math.log2(numerator / denominator)

            feedback_scores[images[j]] = sim_score
            j += 1

        feedback_scores = sorted(feedback_scores.items(), key=lambda k: k[1], reverse=True)
        return feedback_scores

    def calculate_initial_prob_similarity(self, D_matrix, images, n_i):
        N = D_matrix.shape[0]
        n_i = n_i[0]
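        # With no relevance information yet, weight feature i by log2((N - n_i + 0.5) / (n_i + 0.5)),
        # the idf-like term of the binary independence model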

        init_scores = {}

        j = 0
        for d in D_matrix:
            sim_score = 0
            for i in range(0, len(n_i)):
                sim_score = sim_score + d[i] * math.log2((N - n_i[i] + 0.5) / (n_i[i] + 0.5))

            init_scores[images[j]] = sim_score
            j += 1

        init_scores = sorted(init_scores.items(), key=lambda k: k[1], reverse=True)

        return init_scores

    def calculate_r_i(self, D_matrix, images, relevant_items):
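        # r_i[i] counts the relevant images in which feature i is present (> 0)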
        r_i = np.zeros((1, D_matrix.shape[1]))
        i = 0
        for row in D_matrix:
            temp = [1 if row[x] > 0 and images[i] in relevant_items else 0 for x in range(0, len(row))]
            r_i = r_i + np.array(temp).T
            i += 1

        return r_i

    def calculate_n_i(self, D_matrix):
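        # n_i[i] counts the images in which feature i is present (> 0)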

        n_i = np.zeros((1, D_matrix.shape[1]))
        for row in D_matrix:
            temp = [1 if row[x] > 0 else 0 for x in range(0, len(row))]
            n_i = n_i + np.array(temp).T

        return n_i

    def create_X_Y_as_np_matrix(self, rel_items, irl_items):
        X = []
        Y = []

        # Adding relevant items in X and Y
        for item in rel_items:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            X.append(fv.reshape(fv.shape[1]))
            Y.append(1)

        # Adding irrelevant items in X and Y
        for item in irl_items:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            X.append(fv.reshape(fv.shape[1]))
            Y.append(-1)

        return np.array(X), np.array(Y)

    def create_X_test_as_np_matrix(self, test_dataset):
        X = []
        imageNames = []
        # Build the test matrix from the candidate images returned by the LSH index
        for item in test_dataset:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item[0])
            X.append(fv.reshape(fv.shape[1]))
            imageNames.append(item[0])

        return np.array(X), imageNames