Example #1
from similarity.levenshtein import Levenshtein

def get_levenshtein_sim(str1, str2):
    """Return a Levenshtein similarity in [0, 1], or -1.0 for missing values."""
    levenshtein = Levenshtein()
    if str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == '':
        return -1.0
    else:
        max_length = max(len(str1), len(str2))
        # normalise the edit distance by the longer string's length
        return 1.0 - levenshtein.distance(str1, str2) / max_length
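A minimal usage sketch (the inputs below are illustrative):

print(get_levenshtein_sim('kitten', 'sitting'))  # ~0.571: distance 3 over max length 7
print(get_levenshtein_sim('nan', 'kitten'))      # -1.0: 'nan' is treated as a missing value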
Example #2
def is_similar(value, strings):
    """
    Checks whether a string is similar to at least one string in a set.
    :param value: the string to test
    :param strings: an iterable of candidate strings
    :return: True if any candidate is within half of value's length in edit distance
    """
    levenshtein = Levenshtein()
    for s in strings:
        if levenshtein.distance(value, s) < (len(value) / 2):
            return True
    return False
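A quick usage sketch with illustrative inputs:

print(is_similar('colour', {'color', 'flavour'}))  # True: distance('colour', 'color') is 1 < 3
print(is_similar('colour', {'shape', 'weight'}))   # False: no candidate is close enough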
Example #3
    def string_similarity(self, word1, word2):
        levenshtein = Levenshtein()
        lcs = LongestCommonSubsequence()

        ed = levenshtein.distance(word1, word2)
        # sigma = |LCS| / (min length * edit distance), with a factor of 1
        # instead when the words are identical, to avoid dividing by zero
        if ed == 0:
            sigma = lcs.length(word1, word2) / min(len(word1), len(word2))
        else:
            sigma = lcs.length(word1, word2) / (min(len(word1), len(word2)) * ed)
        return sigma
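Since the method never touches self, the metric can be exercised standalone; a minimal sketch, assuming the same similarity package layout as Example #11:

from similarity.levenshtein import Levenshtein
from similarity.longest_common_subsequence import LongestCommonSubsequence

lev, lcs = Levenshtein(), LongestCommonSubsequence()
w1, w2 = 'night', 'nacht'
ed = lev.distance(w1, w2)  # 2 substitutions
# max(ed, 1) folds the two branches above into one expression
sigma = lcs.length(w1, w2) / (min(len(w1), len(w2)) * max(ed, 1))
print(sigma)               # 3 / (5 * 2) = 0.3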
Example #4
def getCorrectWordUsingBigramIndex(word):
    bigram_index = indexer.bigramIndex()
    possible_words = {}
    levenshtein = Levenshtein()
    bigram = indexer.getBigramForWord(word)
    # collect every indexed term that shares at least one bigram with the word
    for b in bigram:
        if b in bigram_index:
            for term in bigram_index[b]:
                possible_words[term] = 0
    # score each candidate by edit distance and return the closest one
    for p_word in possible_words:
        possible_words[p_word] = levenshtein.distance(word, p_word)
    return min(possible_words, key=possible_words.get)
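indexer is defined elsewhere in that project; a sketch of what its two helpers plausibly do (the names mirror the calls above, the bodies are assumptions):

def getBigramForWord(word):
    # hypothetical helper: split a word into overlapping character bigrams
    return [word[i:i + 2] for i in range(len(word) - 1)]

def bigramIndex():
    # hypothetical helper: map each bigram to the vocabulary terms containing it
    vocabulary = ['hello', 'help', 'world']  # illustrative vocabulary
    index = {}
    for term in vocabulary:
        for b in getBigramForWord(term):
            index.setdefault(b, []).append(term)
    return index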
Example #5
def levention():
    l = []
    df = jac()
    q1 = dt.getQues1(df)
    q2 = dt.getQues2(df)
    levenshtein = Levenshtein()
    # edit distance for every question pair
    for w1, w2 in zip(q1, q2):
        l.append(levenshtein.distance(w1, w2))
    # add a placeholder column, then fill it with the computed distances
    df.insert(15, 'levenshtein', " ")
    dt.setleven(df, l)
    return df
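jac and dt are project-specific helpers; a self-contained sketch of the same pattern on a plain pandas DataFrame (column names are illustrative):

import pandas as pd
from similarity.levenshtein import Levenshtein

levenshtein = Levenshtein()
df = pd.DataFrame({'question1': ['what is ai', 'how to cook rice'],
                   'question2': ['what is ml', 'how to cook pasta']})
# one edit distance per question pair, stored as a new column
df['levenshtein'] = [levenshtein.distance(a, b)
                     for a, b in zip(df['question1'], df['question2'])]
print(df)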
Example #6
def ent_incorp(plots, stories):
    # calculate entity incorporation in storyline and story;
    # return the mean, min, max and standard deviation over all lines
    entity_numbers = re.compile(r"(?<=\sent\s)\d+")
    levenshtein = Levenshtein()
    ent_rate = []
    for plot, story in zip(plots, stories):
        plot_entities = entity_numbers.findall(plot)
        story_entities = entity_numbers.findall(story)
        # edit distance between the two entity-ID sequences (works here because
        # the pure-Python implementation only indexes and compares elements)
        incorp_ent = levenshtein.distance(plot_entities, story_entities)
        try:
            # normalise the distance by the longer sequence
            ent_rate_each = incorp_ent / max(len(plot_entities), len(story_entities))
        except ZeroDivisionError:
            ent_rate_each = 0
        ent_rate.append(ent_rate_each)
    return mean(ent_rate), min(ent_rate), max(ent_rate), std(ent_rate)
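A minimal usage sketch; mean and std are assumed to come from numpy, and the ent markup below is illustrative:

import re
from numpy import mean, std
from similarity.levenshtein import Levenshtein

plots = ["<A0> ent 0 travels with ent 1", "<A0> ent 2 sleeps"]
stories = ["Then ent 0 travels with ent 1 to the sea", "and ent 3 sleeps"]
print(ent_incorp(plots, stories))  # roughly (0.5, 0.0, 1.0, 0.5)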
Example #7
def respond(strg):
    levenshtein = Levenshtein()
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()

    # predict the message category, then search only within that category
    # (assuming a scikit-learn style classifier, predict returns a sequence)
    kategori = model.predict([strg])[0]

    txt = stopwords.remove(strg)
    txt = stemmer.stem(txt)

    best = 1000  # large sentinel distance
    res = None

    # keep the dataset entry whose stemmed message is closest to the input
    for words in dataset:
        if words['category'] == kategori:
            distance = levenshtein.distance(txt, words['message_stemmed'])
            if distance < best:
                best = distance
                res = words
    return res['respond'] if res else None
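model and dataset live elsewhere in that project (StemmerFactory and StopWordRemoverFactory come from the Indonesian Sastrawi package); a sketch of the record shape the loop expects, with field names taken from the lookups above and the values purely illustrative:

dataset = [
    {'category': 'greeting',
     'message_stemmed': 'halo apa kabar',            # stemmed, stop-word-free text
     'respond': 'Halo! Ada yang bisa dibantu?'},
    # ... one record per known message
]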
Example #8
    def get_replacement(self, distance='lsh', threshold=.8):
        if distance == 'edit_distance':
            distance = Levenshtein()
        elif distance == 'normalized_edit_distance':
            distance = NormalizedLevenshtein()

        # For each token, query its LSH bin, then group the bin members that
        # fall under the distance threshold, e.g.
        #   [white] = [whit, whie, whit]
        #   [whie]  = [whine, white]

        replacement = {}
        s = self.uniq_values  # note: consumed in place below

        while len(s) > 0:
            # random.sample on a set is unsupported from Python 3.11 on,
            # so draw from a tuple copy instead
            token = rd.choice(tuple(s))
            s.remove(token)
            m = self._generate_hash(token)
            similarities = self.lsh.query(m)
            # skip tokens that are already mapped, in either direction
            similarities = [
                _ for _ in similarities if _ not in replacement.values()
                and _ not in replacement.keys()
            ]
            if len(similarities) > 1:
                bin_replacement = {}
                if distance != 'lsh':
                    # record, for every pair under the threshold, which
                    # candidates each index is interchangeable with
                    for idx, item in enumerate(similarities):
                        for idx_compared in range(idx + 1, len(similarities)):
                            candidate = similarities[idx_compared]
                            if item != candidate and distance.distance(
                                    item, candidate) < threshold:
                                bin_replacement.setdefault(idx, []).append(idx_compared)
                                bin_replacement.setdefault(idx_compared, []).append(idx)

                    # process the best-connected tokens first so they become
                    # the canonical spelling for their group
                    for idx_item, candidates in sorted(
                            bin_replacement.items(), key=lambda x: -len(x[1])):
                        item = similarities[idx_item]
                        if item in replacement:
                            item = replacement[item]
                        for idx_candidate in candidates:
                            candidate = similarities[idx_candidate]
                            if candidate != item and candidate not in replacement:
                                if item not in replacement:
                                    replacement[candidate] = item
                                elif replacement[item] != candidate:
                                    replacement[candidate] = replacement[item]
                else:
                    # pure LSH mode: everything in the bin maps to the sampled token
                    for candidate in similarities:
                        if candidate != token:
                            replacement[candidate] = token

        return replacement
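self.lsh and self._generate_hash are initialised elsewhere; a minimal sketch of a compatible setup, assuming the datasketch library's MinHash LSH (the character-trigram hashing is an assumption):

from datasketch import MinHash, MinHashLSH

def _generate_hash(token, num_perm=128):
    # hash the token's character trigrams into a MinHash signature
    m = MinHash(num_perm=num_perm)
    for i in range(max(len(token) - 2, 1)):
        m.update(token[i:i + 3].encode('utf8'))
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)
for value in ['white', 'whit', 'whie']:
    lsh.insert(value, _generate_hash(value))
print(lsh.query(_generate_hash('white')))  # keys whose signatures collide with 'white'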
Example #9
    def similarity(self, question, answer):
        # load the SSE-specific stop-word list (one word per line)
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = [sw.strip() for sw in stopword]

        # tokenise with jieba and drop the stop words
        meaningful_words1 = [w for w in jieba.cut(str(question)) if w not in stopwords]
        meaningful_words2 = [w for w in jieba.cut(str(answer)) if w not in stopwords]
        s2 = ''.join(meaningful_words1)
        s3 = ''.join(meaningful_words2)

        # metric objects from the similarity package; CharSub is the
        # project's own substitution-cost class
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # assemble one feature vector; the order below is fixed
        line_sim = [
            a1.similarity(s2, s3), a1.distance(s2, s3),  # cosine
            b1.distance(s2, s3),                         # Damerau
            c1.distance(s2, s3), c1.similarity(s2, s3),  # Jaccard
            d1.distance(s2, s3), d1.similarity(s2, s3),  # Jaro-Winkler
            e1.distance(s2, s3),                         # Levenshtein
            f1.distance(s2, s3),                         # longest common subsequence
            g1.distance(s2, s3),                         # metric LCS
            h1.distance(s2, s3),                         # 2-gram
            i1.distance(s2, s3), i1.similarity(s2, s3),  # normalised Levenshtein
            j1.distance(s2, s3),                         # optimal string alignment
            k1.distance(s2, s3),                         # q-gram
            l1.distance(s2, s3), l1.similarity(s2, s3),  # Sorensen-Dice
            m1.distance(s2, s3),                         # weighted Levenshtein
        ]
        return line_sim
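The metric classes are presumably imported from the similarity package; a sketch of the import block this method would need, following the module layout used in Example #11:

from similarity.cosine import Cosine
from similarity.damerau import Damerau
from similarity.jaccard import Jaccard
from similarity.jarowinkler import JaroWinkler
from similarity.levenshtein import Levenshtein
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.qgram import QGram
from similarity.sorensen_dice import SorensenDice
from similarity.weighted_levenshtein import WeightedLevenshtein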
Example #10
                     left_index=True,
                     right_index=True).reset_index()

# compare the similarity of just column_0 to filter out questions that are too similar to
# one another where the question is essentially the same, but with a different object
# e.g. "Have you taken: cocaine" or "Have you taken: opiods"
levenshtein = Levenshtein()

index = 1
threshold = 0.9
reference_string = questions["column_0"].iloc[0]

while index < len(questions) - 1:
    string_1 = reference_string
    string_2 = questions["column_0"].iloc[index]
    levenshtein_distance = levenshtein.distance(string_1, string_2)
    # normalise by the longer string to get a similarity in [0, 1]
    similarity = 1 - levenshtein_distance / max(len(string_1), len(string_2))
    if similarity > threshold:
        # drop the near-duplicate; the next row slides into this position,
        # so the index must not advance
        questions = questions.drop(questions.index[[index]])
    else:
        reference_string = string_2
        index += 1

# clean up the questions by setting all to lower case and stripping punctuation
questions["Question"] = questions["Question"].str.lower()

# compare the similarity of each whole question and drop the questions that are very similar to one another
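A self-contained sketch of the normalised-similarity test the loop applies, on illustrative data:

import pandas as pd
from similarity.levenshtein import Levenshtein

levenshtein = Levenshtein()
questions = pd.DataFrame({"column_0": ["Have you taken: cocaine",
                                       "Have you taken: opioids"]})
s1, s2 = questions["column_0"].iloc[0], questions["column_0"].iloc[1]
similarity = 1 - levenshtein.distance(s1, s2) / max(len(s1), len(s2))
print(round(similarity, 2))  # 0.74: below the 0.9 threshold, so both rows survive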
Example #11
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine
lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)
str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
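Note that Cosine(4) compares the strings through 4-character shingle profiles, so short strings with little character overlap score close to zero, while NormalizedLevenshtein divides the edit distance by the longer string's length, giving a similarity in [0, 1].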