Esempio n. 1
0
def GetScore(src_name, input_name, min_score):
    """Score how closely ``input_name`` matches ``src_name``.

    ``string.punctuation`` characters are removed from both names. Every
    whitespace-separated part of ``input_name`` is then compared with every
    part of ``src_name``; a pair's score is the mean of its Jaro-Winkler
    similarity and its ``ParsedDifference`` value, and the best pair score
    is kept for each input part. The final score is the larger of the
    ``Average`` of those best scores and the Jaro-Winkler similarity of the
    two whole (punctuation-free) names.

    Args:
        src_name: Reference name to compare against.
        input_name: Name being scored.
        min_score: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The chosen score multiplied by 100.
    """
    # Build the punctuation-stripping table once and reuse it for both names.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    src_name = src_name.translate(strip_punctuation)
    input_name = input_name.translate(strip_punctuation)
    jarowinkler = JaroWinkler()

    # The source parts do not change between input parts, so split once.
    src_parts = src_name.split()

    result = []
    for input_name_part in input_name.split():
        column = [
            (jarowinkler.similarity(input_name_part, src_name_part)
             + ParsedDifference(input_name_part, src_name_part)) / 2
            for src_name_part in src_parts
        ]
        # When src_name has no parts there is nothing to match against;
        # score the input part as 0.0 instead of letting max() raise
        # ValueError on an empty sequence.
        result.append(max(column, default=0.0))

    full_inputted_jaro = jarowinkler.similarity(input_name, src_name)
    # NOTE(review): Average() is defined elsewhere; its behaviour for an
    # empty list (input_name without any parts) is not visible here.
    score = Average(result)
    if full_inputted_jaro > score:
        score = full_inputted_jaro
    return score * 100
    def fuzzy_line_equality_detection(self, lines):
        """Tag every token of each line according to its similarity to other lines.

        ``lines`` is split on ``"\\n"``. Each line is compared
        (case-insensitively, Jaro-Winkler) with every other line. If its
        highest similarity reaches ``self.similarity_threshold`` and the
        line contains something other than spaces, every space-separated
        token of the line gets the suffix ``"_T"``; otherwise every token
        gets ``"_N"``.

        Args:
            lines: Newline-separated text.

        Returns:
            The tagged lines joined with ``"\\n"``, in the original order.
        """
        jarowinkler = JaroWinkler()

        # Split and lowercase once up front; the pairwise comparison below is
        # quadratic in the number of lines, so redoing this work per pair
        # would be wasteful.
        raw_lines = lines.split("\n")
        lowered = [line.lower() for line in raw_lines]

        new_lines = []
        for k, line in enumerate(raw_lines):
            # Highest similarity to any *other* line (0 when there is none).
            max_sim = max(
                (jarowinkler.similarity(lowered[k], other)
                 for j, other in enumerate(lowered) if j != k),
                default=0,
            )

            # Lines made only of spaces are never tagged as technical.
            is_technical = (max_sim >= self.similarity_threshold
                            and line.replace(" ", ""))
            suffix = "_T" if is_technical else "_N"
            new_lines.append(" ".join(w + suffix for w in line.split(" ")))

        return "\n".join(new_lines)
# Collect the first field of every record in `data`.
# NOTE(review): presumably this field is the article text - confirm.
temp_article.extend(row[0] for row in data)

print(len(data))

# Keyword query that every collected text is compared against.
# NOTE(review): "moblity" looks like a typo for "mobility"; left unchanged
# because editing the query would change every similarity score below.
my_string = "human moblity prediction spatiotemporal next place future location point-of-interest hotspot forecasting modelling mobility behaviors traffic trajectory mobile phone"

p = []

filter_thresh_45 = []

# One scorer is enough for all comparisons; no need to rebuild it per item.
jarowinkler = JaroWinkler()

for i, article_text in enumerate(temp_article):
    sim = jarowinkler.similarity(my_string, article_text)

    # Keep the full record when its text scores above 0.45 against the query.
    if sim > 0.45:
        filter_thresh_45.append(data[i])

normalized_levenshtein = NormalizedLevenshtein()

filter_normalized_levenshtein = []

for i in range(len(filter_thresh_45)):

    sim = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])