Example #1
0
def get_best_indices(list, sin_val):
    """Find, per metric, the window of ``list`` that scores lowest against ``sin_val``.

    A window of ``len(sin_val)`` consecutive elements is slid over ``list``
    one position at a time. Each window is scored against ``sin_val`` with
    five methods of ``Similarity`` (Euclidean, Manhattan, Minkowski with a
    third argument of 3, Cosine and Jaccard), and the lowest score seen for
    each metric is remembered.

    Args:
        list: Sliceable sequence to search. The name shadows the builtin but
            is kept so existing keyword callers continue to work.
        sin_val: Reference sequence; its length sets the window size.

    Returns:
        A dict keyed by "Euclidean", "Manhattan", "Minkowski", "Cosine" and
        "Jaccard". Each value is ``[best_value, start_index, end_index]``,
        where ``end_index`` is exclusive (``start_index + len(sin_val)``).
        When ``list`` is shorter than ``sin_val`` no window is evaluated and
        every entry keeps its sentinel ``[9999999999, 9999999, 99999999]``.

    NOTE(review): the Cosine and Jaccard entries are minimised as well, yet
    they are computed by methods named ``*_similarity``. If those return a
    similarity rather than a distance, this picks the least similar window --
    confirm against the ``Similarity`` class.
    """
    measures = Similarity()
    size = len(sin_val)

    # (result key, scoring method, extra positional arguments), listed in the
    # order the metrics are evaluated for each window.
    metrics = (
        ("Euclidean", measures.euclidean_distance, ()),
        ("Manhattan", measures.manhattan_distance, ()),
        ("Minkowski", measures.minkowski_distance, (3,)),
        ("Cosine", measures.cosine_similarity, ()),
        ("Jaccard", measures.jaccard_similarity, ()),
    )

    # Sentinels are large enough that the first real score is expected to
    # replace them.
    local_optima = {
        name: [9999999999, 9999999, 99999999] for name, _, _ in metrics
    }

    # "+ 1" so the final full-length window (starting at len(list) - size) is
    # scored too; without it an input exactly as long as sin_val is never
    # compared at all.
    for start in range(len(list) - size + 1):
        stop = start + size
        window = list[start:stop]
        for name, metric, extra in metrics:
            val = metric(window, sin_val, *extra)
            # "<=" means that on equal scores the later window wins.
            if val <= local_optima[name][0]:
                local_optima[name] = [val, start, stop]

    return local_optima
Example #2
0
# Dump the dense tf-idf vector of the first corpus row, comma-separated.
for x in tf_idf.toarray()[0]:
    print(x, end=",")

# Query text
query_path = 'data\\test-bong-da-trong-nuoc.txt'
tf_idf_query = compute_tf_idf_query(query_path, dictionary, idf)
print(tf_idf_query[0])

# Densify the query once: it does not change between corpus rows, so it is
# reused for both the keyword count and every similarity computation below.
query_vector = tf_idf_query.toarray()[0]
count = sum(1 for x in query_vector if x != 0)
print('\nquery keyword count: ' + str(count))

# Calculate the similarity between the query vector and each corpus row.
# NOTE(review): an earlier commented-out variant called
# similarity.cosine_similarity_sprase_matrix(tf_idf, tf_idf_query).flatten();
# confirm that method exists before switching to it.
similarity = Similarity()
similarities = [
    similarity.cosine_similarity(doc_row.toarray()[0], query_vector)
    for doc_row in tf_idf
]

# Sort (similarity, file) pairs in descending order and split them back into
# two parallel tuples. Equal similarities fall back to comparing the file
# entries. Unpacking raises ValueError when there are no pairs to unzip.
similarities, files = zip(*sorted(zip(similarities, files), reverse=True))
# The message below is Vietnamese for "10 files in decreasing similarity".
print('\n10 file có độ tương đồng giảm dần:\n')
sim_re, files_res = similarities[:10], files[:10]

for file_name, score in zip(files_res, sim_re):
    print('%s %f' % (file_name, score))