Example #1
def compute_threshold(self):
    reconstructed_normalized_feat_desc = self.inv_transform(
        self.objectLatentSemantics)
    reconstructed_feat_desc = self.scaler.inverse_transform(
        reconstructed_normalized_feat_desc)
    reconstruction_err = find_distance_2_vectors(reconstructed_feat_desc,
                                                 self.featureDescriptor)
    # Set the threshold at the 85th percentile of the training reconstruction errors.
    self.threshold = np.percentile(reconstruction_err, 85)
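
The method above flags images whose PCA reconstruction error falls in the top 15% of the training set. A minimal, self-contained sketch of the same idea, using scikit-learn's PCA as a stand-in for the project's PCA_Reducer (an assumption, not the project's API):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(200, 64)                        # one feature descriptor per image
pca = PCA(n_components=10).fit(X)
X_hat = pca.inverse_transform(pca.transform(X))    # reconstruct from the latent space
err = np.linalg.norm(X - X_hat, axis=1)            # per-image reconstruction error
threshold = np.percentile(err, 85)                 # roughly 15% of training images exceed it
outliers = err > threshold
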
Example #2
def startTask3():
    print("start task3")
    k = input("Please enter the k value for outgoing edges ")
    K = input("Please enter the K value for visualizing dominant images ")
    k = int(k)
    K = int(K)
    folder = input(
        "Please Select the folder to apply Page Rank \n 1. Labelled Set 1 \n 2. Labelled Set 2 \n"
    )
    if folder == "1":
        folderPath = config.IMAGE_FOLDER_SET_1
    else:
        folderPath = config.IMAGE_FOLDER_SET_2

    data = {}

    for file in os.listdir(str(folderPath)):
        filename = os.fsdecode(file)
        fileExists = os.path.exists(
            join(config.FEATURES_FOLDER, file + ".json"))
        if fileExists:
            with open(join(config.FEATURES_FOLDER, filename + ".json"),
                      "r") as f:
                eachData = json.load(f)
                data.update(eachData)
        else:
            # Compute the HOG descriptor on the fly when no cached JSON exists.
            eachData = HOG().HOGForSingleImage(folderPath, file)
            data.update(eachData)
            # mergingFeatureJson.append(data)

    # print(mergingFeatureJson)

    # fileHOGFullExists = os.path.exists(join(config.DATABASE_FOLDER, "HOG.json"))
    #
    # fileExists = os.path.exists(join(config.DATABASE_FOLDER, "HOG_set_2.json"))
    # if not fileExists:
    #     hog = HOG()
    #     featureVector = hog.HOGFeatureDescriptor()
    #
    #     with open(join(config.DATABASE_FOLDER, "HOG_set_2.json"), 'w', encoding='utf-8') as f:
    #         json.dump(featureVector, f, ensure_ascii=True, indent=4)
    #
    # with open(join(config.DATABASE_FOLDER, "HOG_set_2.json"), "r") as f:
    #     data = json.load(f)

    reducerObject = list(data.values())

    pca = PCA_Reducer(reducerObject)
    latentFeatureDict = {}
    data = pca.reduceDimension(pca.featureDescriptor)
    i = 0
    imageNames = []
    for file in os.listdir(str(folderPath)):
        filename = os.fsdecode(file)
        latent = data.iloc[i][:]
        imageNames.append(filename)
        latentFeatureDict[filename] = latent
        i = i + 1

    adjacency_matrix = [[0 for _ in range(len(latentFeatureDict))]
                        for _ in range(len(latentFeatureDict))]
    for i in range(len(latentFeatureDict)):
        distances = []
        for j in range(len(latentFeatureDict)):
            # print(len(latentFeatureDict[imageNames[i]]), len(latentFeatureDict[imageNames[j]]))
            distances.append(
                find_distance_2_vectors(latentFeatureDict[imageNames[i]],
                                        latentFeatureDict[imageNames[j]]))

        distances = np.asarray(distances)
        ind = np.argpartition(distances, k)[:k]
        total = 0
        for distance_index in ind:
            if distances[distance_index] != 0:
                total += 1 / distances[distance_index]
        for distance_index in ind:
            # Add only the k nearest neighbours to the matrix and normalize so each column becomes probabilistic
            if distances[distance_index] != 0:
                adjacency_matrix[distance_index][
                    i] = 1 / distances[distance_index] / total

    rowDict = {}
    i = 0
    for image in imageNames:
        rowDict[i] = image
        i = i + 1

    df = pd.DataFrame(adjacency_matrix, columns=imageNames)
    df.rename(index=rowDict, inplace=True)

    df.to_csv(join(config.DATABASE_FOLDER, "adjacency_matrix.csv"))

    I = np.identity(df.shape[1])

    print("Enter the three imageIDs to be used as seed")
    imageID_1 = input()
    imageID_2 = input()
    imageID_3 = input()

    seed = pd.Series(0, index=df.index)
    seed.loc[imageID_1] = 0.33
    seed.loc[imageID_2] = 0.33
    seed.loc[imageID_3] = 0.34
    page_rank = np.matmul(np.linalg.inv(I - .75 * df), 0.25 * seed)
    # ind = np.argpartition(page_rank, -K)[-K:]
    # print(page_rank[ind])
    steady_state = pd.Series(page_rank, index=df.index)
    # df.rename(columns={0:"imageName",1:"values"}, inplace=True)
    # steady_state.nlargest(K, ["values"],keep="all")
    steady_state.to_csv(join(config.DATABASE_FOLDER,
                             "steady_state_matrix.csv"))

    col_Names = ["imageNames", "values"]
    my_CSV_File = pd.read_csv(join(config.DATABASE_FOLDER,
                                   "steady_state_matrix.csv"),
                              names=col_Names)
    kDominant = my_CSV_File.nlargest(K, ["values"], keep="all")

    # print(my_CSV_File.nlargest(K, ["values"], keep="all"))
    s = "<style>" \
        "img { width:160px;height:120px" \
        "</style>"
    s = s + "<h2> 3 Seed Images</h2>"
    s = s + "<img src='"
    s = s + join(folderPath, imageID_1)
    s = s + "'>"
    s = s + "<img src='"
    s = s + join(folderPath, imageID_2)
    s = s + "'>"
    s = s + "<img src='"
    s = s + join(folderPath, imageID_3)
    s = s + "'>"
    s = s + "</br></br>"
    s = s + "<h2>" + str(K) + " Dominant Images</h2>"
    for index, row in kDominant.iterrows():
        news = ""
        news = news + "<img src='"
        news = news + join(folderPath, row["imageNames"])
        news = news + "'>"
        s = s + news

    f = open(join(config.DATABASE_FOLDER, "task3.html"), "w")
    f.write(s)
    f.close()

    import webbrowser

    url = join(config.DATABASE_FOLDER, "task3.html")
    # MacOS
    # chrome_path = 'open -a /Applications/Google\ Chrome.app %s'
    # Windows
    chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'
    # Linux
    # chrome_path = '/usr/bin/google-chrome %s'
    webbrowser.get(chrome_path).open(url)
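
The page_rank line above is the closed form of personalized PageRank: with a column-stochastic transition matrix M, a restart (seed) vector s and damping factor beta = 0.75, the steady state is r = (1 - beta) * inv(I - beta * M) * s. A minimal sketch on a toy 3-node graph (toy numbers, not project data):

import numpy as np

M = np.array([[0.0, 0.5, 0.5],
              [0.5, 0.0, 0.5],
              [0.5, 0.5, 0.0]])   # each column sums to 1 (column-stochastic)
s = np.array([1.0, 0.0, 0.0])     # restart mass concentrated on the single seed node
beta = 0.75                       # probability of following an edge instead of restarting
r = (1 - beta) * np.linalg.inv(np.eye(3) - beta * M) @ s
print(r, r.sum())                 # personalized PageRank scores; they sum to 1
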
Example #3
def startTask5():
    global hash_rep
    global database
    l = int(input('Enter layer: '))
    k = int(input('Enter hashes per layer: '))

    database = lsh_index_structure(l, k)

    query_img = input('Enter the Query Image: ')
    t = int(input('Most similar images (t): '))

    # Compute hash rep for query img.
    with open(path.join(config.FEATURES_FOLDER, query_img + '.json'),
              'r',
              encoding='utf-8') as f:
        query_feat_desc = json.load(f)

    query_rep = []
    for i in range(len(hash_rep)):
        img_bucket = img_index(query_feat_desc[query_img], i)
        query_rep.append(img_bucket)
    # print(query_rep)

    # Get list of similar imgs
    similar_imgs = []
    bits_to_ignore = 0
    while True:

        bins_to_consider = set()
        if (bits_to_ignore > 0):
            arr = [None] * bits_to_ignore
            ret_n_size_bin_strings(bits_to_ignore, arr, 0, bins_to_consider)

        for layer_ptr in range(len(database)):
            curr_layer_bit_rep = query_rep[layer_ptr]
            if bits_to_ignore > 0:
                for bin in bins_to_consider:
                    key = curr_layer_bit_rep[:-1 * bits_to_ignore] + bin
                    if key in database[layer_ptr]:
                        similar_imgs.extend(database[layer_ptr][key])
            else:
                if curr_layer_bit_rep in database[layer_ptr]:
                    similar_imgs.extend(
                        database[layer_ptr][curr_layer_bit_rep])

        if len(similar_imgs) >= t or bits_to_ignore == len(query_rep[0]):
            break  # Stop once enough candidates are found or every bucket has been widened.
        else:
            # Reduce the bit size
            bits_to_ignore += 1
            del similar_imgs[:]

    total_imgs = len(similar_imgs)
    unique_similar_imgs = set(similar_imgs)
    unique_imgs = len(unique_similar_imgs)
    print('Number of unique images: ', unique_imgs)
    print('Overall number of images considered: ', total_imgs)

    # Rank all unique images.
    euclid_dist = dict()
    query_fd = query_feat_desc[query_img]
    for img in unique_similar_imgs:
        # print(img)
        with open(path.join(config.FEATURES_FOLDER, img + '.json'),
                  'r',
                  encoding='utf-8') as f:
            img_fd = json.load(f)
        euclid_dist[img] = find_distance_2_vectors(np.asarray(query_fd),
                                                   np.asarray(img_fd[img]))

    sorted_imgs = sorted(euclid_dist.items(), key=lambda kv: kv[1])
    # print(sorted_imgs[:t])

    visualize_for_lsh(sorted_imgs, l, k, t, query_img)
    store_results_as_json(sorted_imgs, l, k, t, query_img)
    store_pickles(l, k)
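
When the query's own bucket holds fewer than t candidates, the loop above widens the search by ignoring trailing bits of the bucket key and probing every bucket that shares the remaining prefix. A minimal sketch of that key expansion, assuming bucket keys are bit strings and using itertools.product in place of the ret_n_size_bin_strings helper (illustration only):

from itertools import product

def candidate_keys(query_key, bits_to_ignore):
    # All bucket keys that agree with the query on everything except the last bits_to_ignore bits.
    if bits_to_ignore == 0:
        return [query_key]
    prefix = query_key[:-bits_to_ignore]
    return [prefix + ''.join(bits) for bits in product('01', repeat=bits_to_ignore)]

print(candidate_keys('01011', 2))   # ['01000', '01001', '01010', '01011']
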
Example #4
def task_6_inp(query_img_fd, l, k, t):
    global hash_rep
    global database
    with open(
            path.join(config.DATABASE_FOLDER,
                      "task5_hashRep_l" + str(l) + "_k" + str(k)), "rb") as f:
        hash_rep = pickle.load(f)
    with open(
            path.join(config.DATABASE_FOLDER,
                      "task5_indexedImgs_l" + str(l) + "_k" + str(k)),
            "rb") as f:
        database = pickle.load(f)

    query_rep = []
    for i in range(len(hash_rep)):
        img_bucket = img_index(query_img_fd, i)
        query_rep.append(img_bucket)
    # print(query_rep)

    # Get list of similar imgs
    similar_imgs = []
    bits_to_ignore = 0
    while True:

        bins_to_consider = set()
        similar_imgs = []
        if (bits_to_ignore > 0):
            arr = [None] * bits_to_ignore
            ret_n_size_bin_strings(bits_to_ignore, arr, 0, bins_to_consider)

        for layer_ptr in range(len(database)):
            curr_layer_bit_rep = query_rep[layer_ptr]
            if bits_to_ignore > 0:
                for bin in bins_to_consider:
                    key = curr_layer_bit_rep[:-1 * bits_to_ignore] + bin
                    if key in database[layer_ptr]:
                        similar_imgs.extend(database[layer_ptr][key])
            else:
                if curr_layer_bit_rep in database[layer_ptr]:
                    similar_imgs.extend(
                        database[layer_ptr][curr_layer_bit_rep])

        if len(similar_imgs) >= t or bits_to_ignore == len(query_rep[0]):
            break  # Stop once enough candidates are found or every bucket has been widened.
        else:
            # Reduce the bit size
            bits_to_ignore += 1

    total_imgs = len(similar_imgs)
    unique_similar_imgs = set(similar_imgs)
    unique_imgs = len(unique_similar_imgs)
    print('Number of unique images: ', unique_imgs)
    print('Overall number of images considered: ', total_imgs)

    # Rank all unique images.
    euclid_dist = dict()
    query_fd = query_img_fd
    for img in unique_similar_imgs:
        # print(img)
        with open(path.join(config.FEATURES_FOLDER, img + '.json'),
                  'r',
                  encoding='utf-8') as f:
            img_fd = json.load(f)
        euclid_dist[img] = find_distance_2_vectors(np.asarray(query_fd),
                                                   np.asarray(img_fd[img]))

    sorted_imgs = sorted(euclid_dist.items(), key=lambda kv: kv[1])
    # print(sorted_imgs[:t])

    visualize_for_task6(sorted_imgs, l, k, t)
    store_results_for_task6(sorted_imgs, l, k, t)
Example #5
def startTask3():
    print("start task3")
    k = input("Please enter the k value for outgoing edges ")
    # K = input("Please enter the K value for visualizing dominant images ")
    k = int(k)
    classify_folder = input("Enter the folder to classify images ")
    fileHOGFullExists = os.path.exists(
        join(config.DATABASE_FOLDER, "HOG_FULL.json"))

    fileExists = os.path.exists(
        join(config.DATABASE_FOLDER, "HOG_classify.json"))
    if not fileExists:
        hog = HOG()
        featureVector = hog.HOGFeatureDescriptor()
        featureVector_classify = hog.HOGFeatureDescriptorForFolder(
            join(config.CLASSIFICATION_FOLDER, classify_folder))
        featureVector.update(featureVector_classify)
        with open(join(config.DATABASE_FOLDER, "HOG_classify.json"),
                  'w+',
                  encoding='utf-8') as f:
            json.dump(featureVector, f, ensure_ascii=True, indent=4)

    with open(join(config.DATABASE_FOLDER, "HOG_classify.json"), "r") as f:
        data = json.load(f)

    reducerObject = list(data.values())

    pca = PCA_Reducer(reducerObject)
    latentFeatureDict = {}
    data = pca.reduceDimension(pca.featureDescriptor)
    print(data.shape)
    i = 0
    imageNames = []
    for file in os.listdir(str(config.IMAGE_FOLDER)):
        filename = os.fsdecode(file)
        latent = data.iloc[i][:]
        imageNames.append(filename)
        latentFeatureDict[filename] = latent
        i = i + 1

    for file in os.listdir(join(config.CLASSIFICATION_FOLDER,
                                classify_folder)):
        filename = os.fsdecode(file)
        latent = data.iloc[i][:]
        imageNames.append(filename)
        latentFeatureDict[filename] = latent
        i = i + 1

    adjacency_matrix = [[0 for _ in range(len(latentFeatureDict))]
                        for _ in range(len(latentFeatureDict))]
    for i in range(len(latentFeatureDict)):
        distances = []
        for j in range(len(latentFeatureDict)):
            distances.append(
                find_distance_2_vectors(latentFeatureDict[imageNames[i]],
                                        latentFeatureDict[imageNames[j]]))

        distances = np.asarray(distances)
        ind = np.argpartition(distances, k)[:k]
        total = 0
        for distance_index in ind:
            if distances[distance_index] != 0:
                total += 1 / distances[distance_index]
        for distance_index in ind:
            # Add only the k nearest neighbours to the matrix and normalize so each column becomes probabilistic
            if distances[distance_index] != 0:
                adjacency_matrix[distance_index][
                    i] = 1 / distances[distance_index] / total

    rowDict = {}
    i = 0
    for image in imageNames:
        rowDict[i] = image
        i = i + 1

    df = pd.DataFrame(adjacency_matrix, columns=imageNames)
    df.rename(index=rowDict, inplace=True)

    df.to_csv(join(config.DATABASE_FOLDER, "adjacency_matrix.csv"))

    I = np.identity(df.shape[1])

    print("Enter the file where the meta-data of the images is present")
    fileName = input()
    metaData = pd.read_csv(join(config.METADATA_FOLDER, fileName))
    metaData = metaData.set_index('imageName', drop=False)  # set_index returns a new frame; keep imageName as a column for the lookups below
    count = metaData.loc[metaData['aspectOfHand'].str.contains(
        "dorsal")].shape[0]
    print(count)
    seed = pd.Series(0, index=df.index)
    seed[metaData.loc[metaData['aspectOfHand'].str.contains(
        "dorsal")].imageName] = 1 / count

    page_rank = np.matmul(np.linalg.inv(I - .50 * df), 0.50 * seed)
    steady_state = pd.Series(page_rank, index=df.index)

    steady_state = steady_state.sort_values(ascending=True)
    steady_state.to_csv(join(config.DATABASE_FOLDER,
                             "steady_state_matrix.csv"))
    steady_state.plot()
    plt.show()
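
The adjacency matrix built here (and reused by the PPR branches in later examples) follows one recipe: for each image, keep its k nearest neighbours by latent-feature distance, weight each edge by 1/distance, and normalize so that the image's column sums to 1. A compact NumPy sketch of that construction (toy random features; the column-into-node convention mirrors the adjacency_matrix[distance_index][i] assignment above):

import numpy as np

def knn_transition_matrix(features, k):
    n = len(features)
    M = np.zeros((n, n))
    for i in range(n):
        d = np.linalg.norm(features - features[i], axis=1)    # distances from image i to all images
        nearest = np.argpartition(d, k)[:k]                   # indices of the k smallest distances
        w = np.array([1.0 / d[j] if d[j] != 0 else 0.0 for j in nearest])
        if w.sum() > 0:
            M[nearest, i] = w / w.sum()                       # edges point into column i and sum to 1
    return M

M = knn_transition_matrix(np.random.rand(10, 5), k=3)
print(M.sum(axis=0))                                          # each column sums to 1
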
Example #6
def startTask6():
    print("Task 6")
    feedbackSystem = int(
        input(
            "Please select the relevance feedback system \n1.SVM Based \n2.Decision Tree Based \n3.PPR Based \n4.Probabilistic based\n"
        ))

    filename = input("Please enter the name of the file (output of 5b)")
    with open(join(config.DATABASE_FOLDER, filename), "r") as f:
        data = json.load(f)
    imagesNames = list(data.keys())
    reducerObjectpp = list(data.values())

    pca_pp = PCA_Reducer(reducerObjectpp, k=len(imagesNames))
    latentFeatureDict = {}
    data_pp = pca_pp.reduceDimension(pca_pp.featureDescriptor)

    relavantImages = set()
    irrelavantImages = set()
    iteration = 0
    # The values below are reused for PPR so they are not recomputed on every iteration
    rowDict = {}
    calculated = False

    ch = "n"
    while ch == "n" or ch == "N":
        iteration = iteration + 1
        numberOfRelavant = int(input("Number of relevant images "))
        numberOfIrRelavant = int(input("Number of irrelevant images "))
        for i in range(numberOfRelavant):
            relavantImages.add(
                input("Please enter relevant image " + str(i + 1) + " "))
        for i in range(numberOfIrRelavant):
            irrelavantImages.add(
                input("Please enter irrelevant image " + str(i + 1) + " "))

        if feedbackSystem == 1:
            print("SVM Based Feedback system")
            image_labels = []
            reducerObject = []
            for i in relavantImages:
                reducerObject.append(data.get(i))
                image_labels.append(-1)
            for i in irrelavantImages:
                reducerObject.append(data.get(i))
                image_labels.append(1)
            pca = PCA_Reducer(reducerObject,
                              k=len(relavantImages) + len(irrelavantImages))
            pca_result = pca.reduceDimension(pca.featureDescriptor)
            svm_object = SVM()
            print("Training SVM")
            svm_object.svm_fit(pca_result, image_labels)
            print("Done Training SVM")
            tempList = list(set(imagesNames) - set(relavantImages))
            unlabelledImages = list(set(tempList) - set(irrelavantImages))
            predicted_values = []
            relevantDistances = {}
            irrelavantDistances = {}
            for i in unlabelledImages:
                pca_output = pca.reduceDimension([data.get(i)])
                # print("pca output: ", pca_output)
                output_label = np.asarray(svm_object.predict(pca_output))[0]
                # print(type(pca_output))
                pca_output = pca_output.values.tolist()
                distance = svm_object.distance(pca_output[0])
                # print(distance)
                # print(type(distance))
                predicted_values.append(output_label)
                if output_label == -1:
                    relevantDistances[distance] = i
                elif output_label == 1:
                    irrelavantDistances[distance] = i

            for i in relavantImages:
                pca_output = pca.reduceDimension([data.get(i)])
                pca_output = pca_output.values.tolist()
                distance = svm_object.distance(pca_output[0])
                # print(distance)
                relevantDistances[distance] = i

            for i in irrelavantImages:
                pca_output = pca.reduceDimension([data.get(i)])
                pca_output = pca_output.values.tolist()
                distance = svm_object.distance(pca_output[0])
                # print(distance)
                irrelavantDistances[distance] = i

            relevantDistancesList = sorted(relevantDistances, reverse=True)
            irrelavantDistancesList = sorted(irrelavantDistances)
            output_images_list = []
            for i in relevantDistancesList:
                output_images_list.append(relevantDistances.get(i))
            for i in irrelavantDistancesList:
                output_images_list.append(irrelavantDistances.get(i))
            # print(output_images_list)
            plotTheResultInChrome(relavantImages, irrelavantImages,
                                  output_images_list, iteration, "SVM")
        elif feedbackSystem == 2:
            print("Decision Tree Based Feedback system")
            reducerObject = []
            for i in relavantImages:
                reducerObject.append(data.get(i))
            for i in irrelavantImages:
                reducerObject.append(data.get(i))
            pca = PCA_Reducer(reducerObject,
                              k=len(relavantImages) + len(irrelavantImages))
            pca_result = pca.reduceDimension(pca.featureDescriptor)
            pca_result = pca_result.values.tolist()
            class_labels = [-1, 1]
            for i in range(0, len(relavantImages)):
                pca_result[i].append(-1)
            count = len(relavantImages)
            for i in range(count, count + len(irrelavantImages)):
                pca_result[i].append(1)

            dtree_object = decisionTree()
            root = dtree_object.construct_dt(pca_result, class_labels, 2, 2)
            tempList = list(set(imagesNames) - set(relavantImages))
            unlabelledImages = list(set(tempList) - set(irrelavantImages))
            relevantConfidence = {}
            irrelevantConfidence = {}
            for i in unlabelledImages:
                pca_output = pca.reduceDimension([data.get(i)])
                pca_output = pca_output.values.tolist()[0]
                output_label = dtree_object.predict(root, pca_output)
                confidence = dtree_object.confidence(root, pca_output,
                                                     output_label)
                if output_label == -1:
                    relevantConfidence[i] = confidence
                else:
                    irrelevantConfidence[i] = confidence

            for i in relavantImages:
                pca_output = pca.reduceDimension([data.get(i)])
                pca_output = pca_output.values.tolist()[0]
                output_label = dtree_object.predict(root, pca_output)
                confidence = dtree_object.confidence(root, pca_output,
                                                     output_label)
                relevantConfidence[i] = confidence

            for i in irrelavantImages:
                pca_output = pca.reduceDimension([data.get(i)])
                pca_output = pca_output.values.tolist()[0]
                output_label = dtree_object.predict(root, pca_output)
                confidence = dtree_object.confidence(root, pca_output,
                                                     output_label)
                irrelevantConfidence[i] = confidence

            relevantConfidenceList = sorted(relevantConfidence.items(),
                                            key=operator.itemgetter(1),
                                            reverse=True)
            irrelavantConfidenceList = sorted(irrelevantConfidence.items(),
                                              key=operator.itemgetter(1))
            output_images_list = []
            # print(relevantConfidenceList)
            # print(irrelavantConfidenceList)
            for key, value in relevantConfidenceList:
                output_images_list.append(key)
            for key, value in irrelavantConfidenceList:
                output_images_list.append(key)
            # print(output_images_list)
            plotTheResultInChrome(relavantImages, irrelavantImages,
                                  output_images_list, iteration,
                                  "Decision Tree")
        elif feedbackSystem == 3:
            print("PPR Based Feedback system")
            if not calculated:
                for i in range(len(imagesNames)):
                    latent = data_pp.iloc[i][:]
                    latentFeatureDict[imagesNames[i]] = latent
                    rowDict[i] = imagesNames[i]

                adjacency_matrix = [[0 for _ in range(len(latentFeatureDict))]
                                    for _ in range(len(latentFeatureDict))]
                # print("")
                print("Generating Adjacency Matrix..")

                for i in range(len(latentFeatureDict)):
                    distances = []
                    for j in range(len(latentFeatureDict)):
                        # print(len(latentFeatureDict[imageNames[i]]), len(latentFeatureDict[imageNames[j]]))
                        distances.append(
                            find_distance_2_vectors(
                                latentFeatureDict[imagesNames[i]],
                                latentFeatureDict[imagesNames[j]]))

                    distances = np.asarray(distances)
                    ind = np.argpartition(distances, 5)[:5]
                    total = 0
                    for distance_index in ind:
                        if distances[distance_index] != 0:
                            total += 1 / distances[distance_index]
                    for distance_index in ind:
                        # Add only the k nearest neighbours to the matrix and normalize so each column becomes probabilistic
                        if distances[distance_index] != 0:
                            adjacency_matrix[distance_index][
                                i] = 1 / distances[distance_index] / total
                calculated = True  # the adjacency matrix only needs to be built once

            seed = pd.Series(0, index=imagesNames)
            length = len(relavantImages)
            for img in relavantImages:
                seed.loc[img] = 1 / length

            seed2 = pd.Series(0, index=imagesNames)
            length2 = len(irrelavantImages)
            for img in irrelavantImages:
                seed2.loc[img] = 1 / length2
            df = pd.DataFrame(adjacency_matrix, columns=imagesNames)
            df.rename(index=rowDict, inplace=True)

            df.to_csv(
                join(config.DATABASE_FOLDER, "adjacency_matrix_task6_c.csv"))

            I = np.identity(df.shape[1])
            page_rank = np.matmul(np.linalg.inv(I - .75 * df), 0.25 * seed)
            page_rank2 = np.matmul(np.linalg.inv(I - .75 * df), 0.25 * seed2)

            steady_state = pd.Series(page_rank, index=df.index)
            steady_state2 = pd.Series(page_rank2, index=df.index)
            steady_state.to_csv(
                join(config.DATABASE_FOLDER,
                     "steady_state_matrix_6_c_" + str(iteration) + ".csv"))
            finalResult = {}
            for i in range(len(imagesNames)):
                finalResult[imagesNames[i]] = steady_state[
                    imagesNames[i]] - steady_state2[imagesNames[i]]

            # finalResult = list(finalResult.keys())
            sortList = sorted(finalResult.items(),
                              key=lambda x: x[1],
                              reverse=True)
            finalResult = list(dict(sortList).keys())
            plotTheResultInChrome(relavantImages, irrelavantImages,
                                  finalResult, iteration, "PPR")

        elif feedbackSystem == 4:
            # pd.read_json takes the path directly; the stray "r" argument has been dropped.
            images_df = pd.read_json(join(config.DATABASE_FOLDER, filename))
            threshold = 0.02
            nQuery = []
            for q in range(images_df.shape[0]):
                nq = 0
                rq = 0
                irq = 0
                for column in images_df:
                    if images_df[column][q] >= threshold:
                        nq += 1
                        if column in relavantImages:
                            rq += 1
                        if column in irrelavantImages:
                            irq += 1
                pq = (rq + nq / images_df.shape[1]) / (len(relavantImages) + 1)
                uq = (irq + nq / images_df.shape[1]) / (len(irrelavantImages) +
                                                        1)
                if pq * (1 - uq) / (uq * (1 - pq) + 1) <= 0:
                    nQuery.append(0)
                else:
                    weight = math.log((pq * (1 - uq)) / (uq * (1 - pq)), 10)
                    if weight < 0:
                        nQuery.append(0)
                    elif weight > 1:
                        nQuery.append(1)
                    else:
                        nQuery.append(weight)
            finalResult = {}
            for i in range(len(imagesNames)):
                product = np.dot(nQuery, reducerObjectpp[i])
                finalResult[imagesNames[i]] = product

            sortList = sorted(finalResult.items(),
                              key=lambda x: x[1],
                              reverse=True)
            finalResult = list(dict(sortList).keys())
            plotTheResultInChrome(relavantImages, irrelavantImages,
                                  finalResult, iteration, "Probabilistic")
        else:
            print("Wrong input")
            exit()

        ch = input(
            "Are you satisfied with the output? Type Y to exit or N to run it again "
        )
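
The feedbackSystem == 4 branch weights every feature by how much more often it fires in relevant images than in irrelevant ones, and then re-scores images by their dot product with that weight vector. A minimal sketch of the per-feature weight, with hypothetical counts and the division guard written out explicitly (the code above folds it into a single <= 0 check):

import math

def feature_weight(rq, irq, nq, n_images, n_relevant, n_irrelevant):
    pq = (rq + nq / n_images) / (n_relevant + 1)      # estimate of P(feature fires | relevant)
    uq = (irq + nq / n_images) / (n_irrelevant + 1)   # estimate of P(feature fires | irrelevant)
    if pq * (1 - uq) <= 0 or uq * (1 - pq) <= 0:
        return 0.0
    w = math.log10((pq * (1 - uq)) / (uq * (1 - pq)))
    return min(max(w, 0.0), 1.0)                      # clamp to [0, 1] as in the code above

print(feature_weight(rq=3, irq=1, nq=10, n_images=100, n_relevant=4, n_irrelevant=2))
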
Example #7
def startTask4():
    print("starting task4")
    print("Enter the folder path containing the labeled images")
    training_folder = input()

    print("Choose one of the below classifier")
    print("1. SVM classifer\n2. Decision-Tree classifier\n3. PPR based classifier")
    classifier = int(input())

    print("Enter the folder path containing the test images")
    test_folder = input()

    hog_feature_map = {}
    counter = 1
    training_files = os.listdir(training_folder)
    print("Extracting features for the training images!")
    for trainingFile in training_files:
        trainingFileJson = os.fsdecode(trainingFile).split('.')[0] + '.' + \
                           os.fsdecode(trainingFile).split('.')[1] + '.json'
        fileExists = os.path.exists(join(config.FEATURES_FOLDER, trainingFileJson))
        if fileExists:
            with open(join(config.FEATURES_FOLDER, trainingFileJson), "r") as f:
                data = json.load(f)
                hog_feature_map.update(data)
        else:
            data = HOG().HOGForSingleImage(training_folder, trainingFile)
            hog_feature_map.update(data)
        progress(counter, len(training_files))
        counter = counter + 1
    reducer_object = list(hog_feature_map.values())
    print("Performing PCA!")
    pca = PCA_Reducer(reducer_object)
    data = pca.reduceDimension(pca.featureDescriptor)
    print("Done performing PCA!")
    if classifier == 1:
        # image labels are added to the imageLabels list. -1 for dorsal and 1 for palmar
        metadata = pd.read_csv(config.METADATA_FOLDER)
        image_lables = get_labels(training_folder, metadata)
        svm_object = SVM()
        print("Training SVM")
        svm_object.svm_fit(data, image_lables)
        print("Done Training SVM")
        test_labels_map = {}
        predicted_values = []
        actual_values = get_labels(test_folder, metadata)
        for file in os.listdir(test_folder):
            test_file = file
            test_file_json = file + '.json'
            file_exists = os.path.exists(join(config.FEATURES_FOLDER, test_file_json))
            if file_exists:
                with open(join(config.FEATURES_FOLDER, test_file_json), "r") as f:
                    data = json.load(f)
            else:
                data = HOG().HOGForSingleImage(test_folder, test_file)
            pca_output = pca.reduceDimension(list(data.values()))
            output_label = np.asarray(svm_object.predict(pca_output))[0]
            predicted_values.append(output_label)
            if output_label == -1:
                test_labels_map[test_file] = "dorsal"
            else:
                test_labels_map[test_file] = "palmar"
        print(test_labels_map)
        accuracy = accuracy_score(actual_values, predicted_values)
        plotInChromeForTask4(test_labels_map, "Task_4_SVM", accuracy)
        print("Test Accuracy: ", accuracy)

    if classifier == 2:
        data = data.values.tolist()  # decision tree takes data as 2d array
        class_labels = [-1, 1]
        i = 0
        metadata = pd.read_csv(config.METADATA_FOLDER)
        for file in os.listdir(training_folder):
            training_file = os.fsdecode(file)
            label = metadata.loc[metadata['imageName'] == training_file]['aspectOfHand'].iloc[0]
            if "dorsal" in label:
                data[i].append(-1)
            else:
                data[i].append(1)
            i = i + 1
        dtree_object = decisionTree()
        root = dtree_object.construct_dt(data, class_labels, 5, 2)
        test_labels_map = {}
        predicted_values = []
        actual_values = get_labels(test_folder, metadata)
        for file in os.listdir(test_folder):
            test_file = os.fsdecode(file).split('.')[0] + '.' + os.fsdecode(file).split('.')[1]
            test_file_json = test_file + '.json'
            file_exists = os.path.exists(join(config.FEATURES_FOLDER, test_file_json))
            if file_exists:
                with open(join(config.FEATURES_FOLDER, test_file_json), "r") as f:
                    data = json.load(f)
            else:
                data = HOG().HOGForSingleImage(test_folder, test_file)
            pca_output = pca.reduceDimension(list(data.values()))
            pca_output = pca_output.values.tolist()[0]
            output_label = dtree_object.predict(root, pca_output)
            predicted_values.append(output_label)
            if output_label == -1:
                test_labels_map[test_file] = "dorsal"
            else:
                test_labels_map[test_file] = "palmar"
        accuracy = accuracy_score(actual_values, predicted_values)
        plotInChromeForTask4(test_labels_map, "Task_4_DECISION", accuracy)
        print("Test Accuracy: ", accuracy)

    if classifier == 3:
        pca_for_all = data

        i = 0
        imageNames = []
        latentFeatureDict = {}

        # Preprocessing for UnLabelled set
        ppr_hog_map = {}
        for test_file in os.listdir(test_folder):
            trainingFileJson = str(test_file) + '.json'
            fileExists = os.path.exists(join(config.FEATURES_FOLDER, trainingFileJson))
            if fileExists:
                with open(join(config.FEATURES_FOLDER, trainingFileJson), "r") as f:
                    data = json.load(f)
                    ppr_hog_map.update(data)
            else:
                data = HOG().HOGForSingleImage(test_folder, test_file)
                ppr_hog_map.update(data)
        # Appending the labelled data values with unlabelled images data
        reducer_object = list(hog_feature_map.values())
        pp_reducer_object = list(ppr_hog_map.values())
        pp_reducer_object = reducer_object + pp_reducer_object
        pca = PCA_Reducer(pp_reducer_object)
        unlabelled_ppr_data = pca.reduceDimension(pca.featureDescriptor)
        pca_for_all = unlabelled_ppr_data

        for file in os.listdir(str(training_folder)):
            filename = os.fsdecode(file)
            latent = pca_for_all.iloc[i][:]
            imageNames.append(filename)
            latentFeatureDict[filename] = latent
            i = i + 1

        for file in os.listdir(join(test_folder)):
            filename = os.fsdecode(file)
            latent = pca_for_all.iloc[i][:]
            imageNames.append(filename)
            latentFeatureDict[filename] = latent
            i = i + 1

        # seed = pd.Series(0, index=imageNames)
        print("Generating Adjacency Matrix..")
        adjacency_matrix = [[0 for _ in range(len(latentFeatureDict))] for _ in range(len(latentFeatureDict))]
        for i in range(len(latentFeatureDict)):
            distances = []
            for j in range(len(latentFeatureDict)):
                # print(len(latentFeatureDict[imageNames[i]]), len(latentFeatureDict[imageNames[j]]))
                distances.append(find_distance_2_vectors(latentFeatureDict[imageNames[i]],
                                                         latentFeatureDict[imageNames[j]]))

            distances = np.asarray(distances)
            ind = np.argpartition(distances, 20)[:20]
            total = 0
            for distance_index in ind:
                if distances[distance_index] != 0:
                    total += 1 / distances[distance_index]
            for distance_index in ind:
                # Add only the k nearest neighbours to the matrix and normalize so each column becomes probabilistic
                if distances[distance_index] != 0:
                    adjacency_matrix[distance_index][i] = 1 / distances[distance_index] / total

        rowDict = {}
        i = 0
        for image in imageNames:
            rowDict[i] = image
            i = i + 1

        df = pd.DataFrame(adjacency_matrix, columns=imageNames)
        df.rename(index=rowDict, inplace=True)

        df.to_csv(join(config.DATABASE_FOLDER, "adjacency_matrix_for_task_4.csv"))

        I = np.identity(df.shape[1])
        seed = pd.Series(0, index=imageNames)
        metadata = pd.read_csv(config.METADATA_FOLDER)
        image_lables = get_labels(training_folder, metadata)
        count = image_lables.count(-1)
        val = 1 / count
        for i in range(len(os.listdir(training_folder))):
            if image_lables[i] == -1:
                seed.loc[imageNames[i]] = val
        # print(seed)
        seed2 = pd.Series(0, index=imageNames)
        count2 = image_lables.count(1)
        val2 = 1 / count2
        for i in range(len(os.listdir(training_folder))):
            if image_lables[i] == 1:
                seed2.loc[imageNames[i]] = val2

        page_rank = np.matmul(np.linalg.inv(I - .75 * df), 0.25 * seed)
        page_rank2 = np.matmul(np.linalg.inv(I - .75 * df), 0.25 * seed2)
        steady_state = pd.Series(page_rank, index=df.index)
        steady_state2 = pd.Series(page_rank2, index=df.index)
        test_labels_map = {}
        predicted_values = []
        for file in os.listdir(join(test_folder)):
            if steady_state[file] >= steady_state2[file]:
                test_labels_map[file] = "dorsal"
                predicted_values.append(-1)
            else:
                test_labels_map[file] = "palmer"
                predicted_values.append(1)

        actual_values = get_labels(test_folder, metadata)
        accuracy = accuracy_score(actual_values, predicted_values)
        plotInChromeForTask4(test_labels_map, "Task_4_PPR", accuracy)
        print("Test Accuracy: ", accuracy)
        steady_state = steady_state.sort_values(ascending=True)
        steady_state.to_csv(join(config.DATABASE_FOLDER, "steady_state_matrix_for_task_4.csv"))
        steady_state.plot()
        plt.show()
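
The PPR classifier above runs personalized PageRank twice, once seeded with the dorsal training images and once with the palmar ones, and labels each test image with whichever run gives it the higher steady-state score. A toy sketch of that decision rule (hypothetical image names and a uniform 5-node transition matrix, not project data):

import numpy as np
import pandas as pd

names = ["d1", "d2", "p1", "p2", "t1"]
M = np.full((5, 5), 0.25)
np.fill_diagonal(M, 0.0)                   # toy column-stochastic transition matrix
I = np.eye(5)
dorsal_seed = pd.Series([0.5, 0.5, 0.0, 0.0, 0.0], index=names)
palmar_seed = pd.Series([0.0, 0.0, 0.5, 0.5, 0.0], index=names)
dorsal_rank = pd.Series(0.25 * (np.linalg.inv(I - 0.75 * M) @ dorsal_seed.to_numpy()), index=names)
palmar_rank = pd.Series(0.25 * (np.linalg.inv(I - 0.75 * M) @ palmar_seed.to_numpy()), index=names)
print("dorsal" if dorsal_rank["t1"] >= palmar_rank["t1"] else "palmar")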