def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text) |
        db.Q(alternate_urls__iexact=text)).first()
    if license is None:
        # Try to single match with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match on alternate titles with a low
        # Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(cls.slug.slugify(t), slug))
                    for l in cls.objects
                    for t in l.alternate_titles)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    return license

def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text) |
        db.Q(alternate_urls__iexact=text)).first()
    if license is None:
        # If we're dealing with a URL, let's try some specific tricks like
        # getting rid of the trailing slash and scheme mismatch
        try:
            url = validate_url(text)
        except ValidationError:
            pass
        else:
            parsed = urlparse(url)
            path = parsed.path.rstrip('/')
            query = f'{parsed.netloc}{path}'
            license = qs(
                db.Q(url__icontains=query) |
                db.Q(alternate_urls__contains=query)).first()
    if license is None:
        # Try to single match `slug` with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to match `title` with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.title.lower(), text))
                    for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match `alternate_titles` with a low
        # Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(cls.slug.slugify(t), slug))
                    for l in cls.objects
                    for t in l.alternate_titles)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If more than one license matches, we cannot determine
        # which one is closer to safely choose between candidates
        if len(set(candidates)) == 1:
            license = candidates[0]
    return license

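# Sketch: the Damerau-Levenshtein fallback used above, shown in isolation.
# Slugify the input, compute the restricted Damerau-Levenshtein distance
# against every known slug, and accept a candidate only when exactly one
# falls within the threshold. This is a minimal illustration assuming the
# `stringdist` package; MAX_DISTANCE, KNOWN_SLUGS and slugify() below are
# hypothetical stand-ins for the host project's model and settings.
from stringdist import rdlevenshtein

MAX_DISTANCE = 2  # assumed threshold, not the host project's actual value
KNOWN_SLUGS = ['cc-by', 'cc-by-sa', 'odc-odbl', 'lov2']  # hypothetical data


def slugify(text):
    # Crude stand-in for the model's slugifier: lower-case and hyphenate
    return '-'.join(text.strip().lower().split())


def guess_slug(text):
    slug = slugify(text)
    # Keep only the candidates within the edit-distance threshold
    candidates = [s for s in KNOWN_SLUGS
                  if rdlevenshtein(s, slug) <= MAX_DISTANCE]
    # Accept a match only when it is unambiguous
    return candidates[0] if len(candidates) == 1 else None


print(guess_slug('CC BY'))     # -> 'cc-by'
print(guess_slug('cc by sa'))  # -> 'cc-by-sa'
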
def get_closest_k_words(word, k):
    # Return the k in-vocabulary words closest to `word` by restricted
    # Damerau-Levenshtein distance, as (WORD, CLOSENESS, FREQ) tuples.
    closest_words = [("", sys.maxsize, 0) for _ in range(k)]
    if not word:
        return closest_words
    closest_dist = sys.maxsize
    for w in freq:
        rdldist = sd.rdlevenshtein(w, word)
        if rdldist <= closest_words[0][1]:
            closest_dist = rdldist
            # (WORD, CLOSENESS, FREQ)
            candidate = (w, rdldist, freq[w])
            closest_words = update(closest_words, candidate)
    return closest_words

def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id=text) | db.Q(slug=slug) | db.Q(url=text) |
        db.Q(alternate_urls=text)
    ).first()
    if license is None:
        # Try to single match with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match on alternate titles with a low
        # Damerau-Levenshtein distance
        computed = (
            (l, rdlevenshtein(cls.slug.slugify(t), slug))
            for l in cls.objects
            for t in l.alternate_titles
        )
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    return license

def test_rdlevenshtein_transposition(self):
    """It should return the correct distance when a transposition is involved"""
    self.assertEqual(rdlevenshtein('abced', 'abcde'), 1)

def test_rdlevenshtein_substitution(self):
    """It should return the correct distance when a substitution is involved"""
    self.assertEqual(rdlevenshtein('abcd!', 'abcde'), 1)

def test_rdlevenshtein_matching(self):
    """It should return a distance of zero when the strings match"""
    self.assertEqual(rdlevenshtein('abcde', 'abcde'), 0)

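# Sketch: these tests exercise the property that separates the restricted
# Damerau-Levenshtein distance from plain Levenshtein, namely that an
# adjacent transposition counts as a single edit. Illustration assuming the
# `stringdist` package the tests appear to target.
from stringdist import levenshtein, rdlevenshtein

# Swapping two adjacent characters is one edit for restricted
# Damerau-Levenshtein but two for plain Levenshtein.
print(rdlevenshtein('abced', 'abcde'))  # 1
print(levenshtein('abced', 'abcde'))    # 2

# Substitutions and exact matches cost the same under both metrics.
print(rdlevenshtein('abcd!', 'abcde'))  # 1
print(levenshtein('abcd!', 'abcde'))    # 1
print(rdlevenshtein('abcde', 'abcde'))  # 0
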
def Analysis():
    '''
    The main idea behind this is to observe and analyse the patterns in
    which the CSRF tokens are generated by the server.
    '''
    ctr = 0  # Counter variable set to 0
    # Check whether the number of gathered tokens is greater than 1
    if len(REQUEST_TOKENS) > 1:
        verbout(color.RED, '\n +--------------+')
        verbout(color.RED, ' |   Analysis   |')
        verbout(color.RED, ' +--------------+\n')
        print(GR + 'Proceeding for post-scan analysis of tokens gathered...')
        verbout(G, 'A total of %s tokens were discovered during the scan'
                % (len(REQUEST_TOKENS)))
        # The idea behind this is to generate all possible combinations (not
        # considering permutations) from the given list of discovered tokens
        # and to infer the anti-CSRF token generation pattern.
        for tokenx1, tokenx2 in itertools.combinations(REQUEST_TOKENS, 2):
            try:
                verbout(GR, 'Analysing 2 Anti-CSRF Tokens from gathered requests...')
                verbout(color.CYAN, ' [+] First Token: ' + color.BLUE + tokenx1)
                verbout(color.ORANGE, ' [+] Shannon Entropy: ' + color.GREEN
                        + '%s' % (calcEntropy(tokenx1)))
                verbout(color.CYAN, ' [+] Second Token: ' + color.BLUE + tokenx2)
                verbout(color.ORANGE, ' [+] Shannon Entropy: ' + color.GREEN
                        + '%s' % (calcEntropy(tokenx2)))
                # Calculate the edit distance via the Damerau-Levenshtein algorithm
                m = stringdist.rdlevenshtein(tokenx1, tokenx2)
                verbout(color.CYAN, ' [+] Edit Distance Calculated: '
                        + color.GREY + str(m) + '%')
                # Now it's time to compute the alignment ratio
                n = stringdist.rdlevenshtein_norm(tokenx1, tokenx2)
                verbout(color.CYAN, ' [+] Alignment Ratio Calculated: '
                        + color.GREY + str(n))
                # If both tokens have the same length
                if len(tokenx1) == len(tokenx2):
                    verbout(C, 'Token length calculated is same: ' + color.ORANGE
                            + 'Each %s bytes' % len(byteString(tokenx1)))
                else:
                    verbout(C, 'Token length calculated is different: ' + color.ORANGE
                            + 'By %s bytes' % (len(byteString(tokenx1))
                                               - len(byteString(tokenx2))))
                time.sleep(0.5)
                # In my experience with web security assessments, the anti-CSRF
                # token is often composed of two parts: one remains static while
                # the other is dynamic.
                #
                # For example, if the anti-CSRF token is '837456mzy29jkd911139'
                # for one request and '837456mzy29jkd337221' for another, the
                # '837456mzy29jkd' part of the token remains the same in both
                # requests.
                #
                # The main idea behind this is to detect the static and dynamic
                # parts via the DL algorithm, as discussed above, by calculating
                # the edit distance.
                p = sameSequence(tokenx1, tokenx2)
                tokenx01 = tokenx1.replace(p, '')
                tokenx02 = tokenx2.replace(p, '')
                if n == 0.5 or m == len(tokenx1) / 2:
                    verbout(GR, 'The tokens are composed of 2 parts (one static and the other dynamic)...')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    # A short dynamic part (6 characters or fewer) is considered weak
                    if len(tokenx1) / 2 <= 6:
                        verbout(color.RED, ' [-] Post-Analysis reveals that token might be '
                                + color.BR + ' VULNERABLE ' + color.END + '!')
                        print(color.RED + ' [+] Possible CSRF Vulnerability Detected!')
                        print(color.ORANGE + ' [!] Vulnerability Type: ' + color.BR
                              + ' Weak Dynamic Part of Tokens ' + color.END)
                        print(color.GREY + ' [+] Tokens can easily be ' + color.RED
                              + 'Forged by Bruteforcing/Guessing' + color.END + '!\n')
                        VulnLogger('Analysis',
                                   'Tokens can easily be Forged by Bruteforcing/Guessing.',
                                   '[i] Token 1: ' + tokenx1 + '\n[i] Token 2: ' + tokenx2)
                elif n < 0.5 or m < len(tokenx1) / 2:
                    verbout(R, 'Token distance calculated is ' + color.RED + 'less than 0.5!')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    verbout(color.RED, ' [-] Post-Analysis reveals that token might be '
                            + color.BR + ' VULNERABLE ' + color.END + '!')
                    print(color.GREEN + ' [+] Possible CSRF Vulnerability Detected!')
                    print(color.ORANGE + ' [!] Vulnerability Type: ' + color.BR
                          + ' Weak Dynamic Part of Tokens ' + color.END)
                    print(color.GREY + ' [+] Tokens can easily be ' + color.RED
                          + 'Forged by Bruteforcing/Guessing' + color.END + '!\n')
                    VulnLogger('Analysis',
                               'Tokens can easily be Forged by Bruteforcing/Guessing.',
                               '[i] Token 1: ' + tokenx1 + '\n[i] Token 2: ' + tokenx2)
                else:
                    verbout(R, 'Token distance calculated is ' + color.GREEN + 'greater than 0.5!')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    verbout(color.GREEN, ' [+] Post-Analysis reveals that tokens are '
                            + color.BG + ' NOT VULNERABLE ' + color.END + '!')
                    print(color.ORANGE + ' [!] Vulnerability Mitigation: ' + color.BG
                          + ' Strong Dynamic Part of Tokens ' + color.END)
                    print(color.GREY + ' [+] Tokens ' + color.GREEN
                          + 'Cannot be Forged by Bruteforcing/Guessing' + color.END + '!\n')
                    NovulLogger('Analysis', 'Tokens cannot be Forged by Bruteforcing/Guessing.')
                time.sleep(1)
            except KeyboardInterrupt:
                continue
        print(C + 'Post-Scan Analysis Completed!')

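# Sketch: the comments in Analysis() describe tokens that share a static
# prefix and differ only in a dynamic tail; the raw and normalized restricted
# Damerau-Levenshtein distances make that split visible. Minimal illustration
# assuming the `stringdist` package, with os.path.commonprefix used as a
# stand-in for the tool's sameSequence() helper.
import os.path

from stringdist import rdlevenshtein, rdlevenshtein_norm

# The two example tokens from the comment above: the first 14 characters
# are static, the last 6 are dynamic.
token1 = '837456mzy29jkd911139'
token2 = '837456mzy29jkd337221'

static = os.path.commonprefix([token1, token2])  # '837456mzy29jkd'
dynamic1 = token1[len(static):]                  # '911139'
dynamic2 = token2[len(static):]                  # '337221'

m = rdlevenshtein(token1, token2)       # 6: only the dynamic tail differs
n = rdlevenshtein_norm(token1, token2)  # 0.3: distance divided by the longer length

print(static, dynamic1, dynamic2, m, n)
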
def label(exp, termset):
    if not exp:
        return None
    # If there's an exact match, we're done
    if exp in termset:
        return (exp, exp, "exact_full")
    #### OK, so, no exact matches; we're going to look for misspellings,
    #### subwords, and exact matches within the larger string.
    # First, cut down potential matches to those that are shorter or almost
    # shorter than exp (within edit distance range)
    exp_len = len(exp)
    terms = [t for t in termset if len(t) <= exp_len + 2]
    #### First look for almost exact matches based on edit distance:
    # compute the edit distance for every term and take all terms that have
    # an edit distance of 2 or less
    term_to_edit_dist = [(t, rdlevenshtein(t, exp)) for t in terms]
    term_to_edit_dist = [x for x in term_to_edit_dist if x[1] < 3]
    # Sort by edit distance, then length
    term_to_edit_dist = sorted(term_to_edit_dist, key=cmp_to_key(my_cmp))
    if len(term_to_edit_dist):
        for t, dist in term_to_edit_dist:
            if passes_edit_dist_ratio(dist, t, exp):
                # Take the best match -> longest and lowest edit distance
                best_match = term_to_edit_dist[0]
                return (exp, best_match[0], "editdist")
    # OK, nothing obvious, so we need to search through the string.
    #### First, look for exact matches; these can be either subwords or full matches
    exact_matches = [t for t in terms if t in exp]
    # If there are any exact matches, return them
    if len(exact_matches):
        # Order by length and take the longest
        exact_matches = sorted(exact_matches, key=lambda x: -len(x))
        matched_term = exact_matches[0]
        return determine_subword(matched_term, matched_term, exp)
    ### OK, otherwise we're in a situation where we have a fuzzy match to a
    ### subset of the string: find all terms that match somewhere within the
    ### string with an edit distance of < 3
    fuzzy_matches = []
    for t in terms:
        # Allow a bigger edit distance here, then re-check with rdlevenshtein
        fuzzy_match = find_near_matches(t, exp, max_l_dist=2)
        if fuzzy_match:
            matched_text = exp[fuzzy_match[0].start:fuzzy_match[0].end]
            if passes_edit_dist_ratio(fuzzy_match[0].dist, matched_text, t):
                fuzzy_matches.append((matched_text, t))
    if len(fuzzy_matches):
        # Sort the fuzzy matches by length of the matched substring and return
        fuzzy_matches = sorted(fuzzy_matches, key=lambda x: -len(x[0]))
        return determine_subword(fuzzy_matches[0][0], fuzzy_matches[0][1], exp, "fuzzy_")
    # OK, otherwise, no matches!
    return None

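# Sketch: the subword branch of label() combines an approximate substring
# search (find_near_matches, which appears to come from the `fuzzysearch`
# package) with an edit-distance check on the matched span. Minimal
# illustration with a made-up term and sentence.
from fuzzysearch import find_near_matches
from stringdist import rdlevenshtein

term = 'levenshtein'
text = 'we rank the candidates by levenstein distance'

matches = find_near_matches(term, text, max_l_dist=2)
if matches:
    span = text[matches[0].start:matches[0].end]
    # Re-check the matched span with the restricted Damerau-Levenshtein distance
    print(span, matches[0].dist, rdlevenshtein(span, term))
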
def main(argv):
    topic = argv[0]
    filelang = argv[1]
    mainlang = argv[2]
    path = "/home/oyku/embeddings/fasttext/wiki." + filelang + ".align.vec"
    dictionary = load_vec(path)
    mono_path = "/home/oyku/monolingual_fasttext/cc." + filelang + ".300"
    mono_wv = fText.load_fasttext_format(mono_path)
    file = "/home/oyku/myversion/oov_words/" + mainlang + "/" + topic + "_" + filelang + ".txt"
    f = open(file, 'r', encoding='utf8')
    content = f.readlines()
    cont = set()
    for el in content:
        if not el.strip().isdigit():
            cont.add(el.strip())
    print("The number of OOVs: " + str(len(content)))
    print("The number of word OOVs: " + str(len(cont)))
    ## Morphologic
    morphs = {}
    for blob in cont:
        if not blob.isdigit():
            text = Text(blob)
            text.language = filelang
            morphemes = []
            for morp in text.morphemes:
                if len(morp) > 3 and morp in dictionary:
                    morphemes.append(morp)
            if len(morphemes) != 0:
                morphs[blob] = morphemes
    print("Morphologic check is over")
    left = cont.difference(morphs)
    ## Spelling
    spellex = {}
    for oov in left:
        if len(oov) > 2:
            possibles = []
            for inv in dictionary:
                if stringdist.rdlevenshtein(oov, inv) == 1:
                    possibles.append(inv)
            if len(possibles) == 1:
                spellex[oov] = possibles
    print("Spelling check is over")
    next_left = left.difference(spellex)
    ## Fasttext nearest neighbours
    fasttext_bin = {}
    for oov in next_left:
        try:
            similars = mono_wv.wv.most_similar(oov.strip())
            most_sim = ""
            for sim in similars:
                if sim[0] in dictionary and sim[1] > 0.5:
                    most_sim = sim[0]
                    break
            if most_sim != "":
                fasttext_bin[oov.strip()] = [most_sim]
        except:
            continue
    print("Fasttext check is over")
    print("-----------------------------------------------")
    print("Identified with morphologic analysis: " + str(len(morphs)))
    print("Identified with spell analysis: " + str(len(spellex)))
    print("Identified with Fasttext: " + str(len(fasttext_bin)))
    union = union3(morphs, spellex, fasttext_bin)
    print("Total: " + str(len(union)))
    saved_path = "/home/oyku/myversion/oov_matches/" + mainlang + "/" + topic + "_" + filelang + ".p"
    pickle.dump(union, open(saved_path, "wb"))

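# Sketch: the spelling step above keeps an OOV word only when exactly one
# in-vocabulary word sits at restricted Damerau-Levenshtein distance 1.
# Self-contained illustration with a toy vocabulary (the real `dictionary`
# holds aligned fastText vectors).
from stringdist import rdlevenshtein

vocabulary = {'apple', 'apply', 'maple', 'grape'}  # toy stand-in


def unique_distance_one_match(oov, vocab):
    # Collect every in-vocabulary word exactly one edit away from the OOV token
    possibles = [w for w in vocab if rdlevenshtein(oov, w) == 1]
    # Keep the correction only when it is unambiguous
    return possibles[0] if len(possibles) == 1 else None


print(unique_distance_one_match('aple', vocabulary))    # None: 'apple' and 'maple' both match
print(unique_distance_one_match('grapes', vocabulary))  # 'grape'
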
def test_lev(self):
    self.assertEqual(sd.rdlevenshtein('abcd!', 'abcde'), 1)