def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text) |
        db.Q(alternate_urls__iexact=text)).first()
    if license is None:
        # Try to single match with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match on alternate titles with a low
        # Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(cls.slug.slugify(t), slug))
                    for l in cls.objects
                    for t in l.alternate_titles)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    return license

def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text) |
        db.Q(alternate_urls__iexact=text)).first()
    if license is None:
        # If we're dealing with a URL, let's try some specific tricks like
        # getting rid of the trailing slash and scheme mismatch
        try:
            url = validate_url(text)
        except ValidationError:
            pass
        else:
            parsed = urlparse(url)
            path = parsed.path.rstrip('/')
            query = f'{parsed.netloc}{path}'
            license = qs(
                db.Q(url__icontains=query) |
                db.Q(alternate_urls__contains=query)).first()
    if license is None:
        # Try to single match `slug` with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to match `title` with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.title.lower(), text))
                    for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match `alternate_titles` with a low
        # Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(cls.slug.slugify(t), slug))
                    for l in cls.objects
                    for t in l.alternate_titles)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If more than one license matches, we cannot determine
        # which one is closer to safely choose between candidates
        if len(set(candidates)) == 1:
            license = candidates[0]
    return license

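# Sketch: the Damerau-Levenshtein fallback used above, shown in isolation.
# Slugify the input, compute the restricted Damerau-Levenshtein distance
# against every known slug, and accept a candidate only when exactly one
# falls within the threshold. This is a minimal illustration assuming the
# `stringdist` package; MAX_DISTANCE, KNOWN_SLUGS and slugify() below are
# hypothetical stand-ins for the host project's model and settings.
from stringdist import rdlevenshtein

MAX_DISTANCE = 2  # assumed threshold, not the host project's actual value
KNOWN_SLUGS = ['cc-by', 'cc-by-sa', 'odc-odbl', 'lov2']  # hypothetical data


def slugify(text):
    # Crude stand-in for the model's slugifier: lower-case and hyphenate
    return '-'.join(text.strip().lower().split())


def guess_slug(text):
    slug = slugify(text)
    # Keep only the candidates within the edit-distance threshold
    candidates = [s for s in KNOWN_SLUGS
                  if rdlevenshtein(s, slug) <= MAX_DISTANCE]
    # Accept a match only when it is unambiguous
    return candidates[0] if len(candidates) == 1 else None


print(guess_slug('CC BY'))     # -> 'cc-by'
print(guess_slug('cc by sa'))  # -> 'cc-by-sa'
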
def get_closest_k_words(word, k):
    # Return the k in-vocabulary words closest to `word` by restricted
    # Damerau-Levenshtein distance, as (WORD, CLOSENESS, FREQ) tuples.
    closest_words = [("", sys.maxsize, 0) for _ in range(k)]
    if not word:
        return closest_words
    closest_dist = sys.maxsize
    for w in freq:
        rdldist = sd.rdlevenshtein(w, word)
        if rdldist <= closest_words[0][1]:
            closest_dist = rdldist
            # (WORD, CLOSENESS, FREQ)
            candidate = (w, rdldist, freq[w])
            closest_words = update(closest_words, candidate)
    return closest_words

def guess_one(cls, text):
    '''
    Try to guess a license from a string.

    Try an exact match on identifier, then on slugified title,
    and fall back on edit distance ranking (after slugification).
    '''
    if not text:
        return
    qs = cls.objects
    text = text.strip().lower()  # Stored identifiers are lower case
    slug = cls.slug.slugify(text)  # Use the slug as it normalizes the string
    license = qs(
        db.Q(id=text) | db.Q(slug=slug) | db.Q(url=text) |
        db.Q(alternate_urls=text)
    ).first()
    if license is None:
        # Try to single match with a low Damerau-Levenshtein distance
        computed = ((l, rdlevenshtein(l.slug, slug)) for l in cls.objects)
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    if license is None:
        # Try to single match on alternate titles with a low
        # Damerau-Levenshtein distance
        computed = (
            (l, rdlevenshtein(cls.slug.slugify(t), slug))
            for l in cls.objects
            for t in l.alternate_titles
        )
        candidates = [l for l, d in computed if d <= MAX_DISTANCE]
        # If there is more than one match, we cannot determine
        # which one is closer to safely choose between candidates
        if len(candidates) == 1:
            license = candidates[0]
    return license

def test_rdlevenshtein_transposition(self):
    """It should return the correct distance when a transposition is involved"""
    self.assertEqual(rdlevenshtein('abced', 'abcde'), 1)

def test_rdlevenshtein_substitution(self):
    """It should return the correct distance when a substitution is involved"""
    self.assertEqual(rdlevenshtein('abcd!', 'abcde'), 1)

def test_rdlevenshtein_matching(self):
    """It should return a distance of zero when the strings match"""
    self.assertEqual(rdlevenshtein('abcde', 'abcde'), 0)

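# Sketch: these tests exercise the property that separates the restricted
# Damerau-Levenshtein distance from plain Levenshtein, namely that an
# adjacent transposition counts as a single edit. Illustration assuming the
# `stringdist` package the tests appear to target.
from stringdist import levenshtein, rdlevenshtein

# Swapping two adjacent characters is one edit for restricted
# Damerau-Levenshtein but two for plain Levenshtein.
print(rdlevenshtein('abced', 'abcde'))  # 1
print(levenshtein('abced', 'abcde'))    # 2

# Substitutions and exact matches cost the same under both metrics.
print(rdlevenshtein('abcd!', 'abcde'))  # 1
print(levenshtein('abcd!', 'abcde'))    # 1
print(rdlevenshtein('abcde', 'abcde'))  # 0
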
def Analysis():
    '''
    The main idea behind this is to observe and analyse the patterns in
    which the CSRF tokens are generated by the server.
    '''
    ctr = 0  # Counter variable set to 0
    # Check whether the number of gathered tokens is greater than 1
    if len(REQUEST_TOKENS) > 1:
        verbout(color.RED, '\n +--------------+')
        verbout(color.RED, ' |   Analysis   |')
        verbout(color.RED, ' +--------------+\n')
        print(GR + 'Proceeding for post-scan analysis of tokens gathered...')
        verbout(G, 'A total of %s tokens were discovered during the scan'
                % (len(REQUEST_TOKENS)))
        # The idea behind this is to generate all possible combinations (not
        # considering permutations) from the given list of discovered tokens
        # and to infer the anti-CSRF token generation pattern.
        for tokenx1, tokenx2 in itertools.combinations(REQUEST_TOKENS, 2):
            try:
                verbout(GR, 'Analysing 2 Anti-CSRF Tokens from gathered requests...')
                verbout(color.CYAN, ' [+] First Token: ' + color.BLUE + tokenx1)
                verbout(color.ORANGE, ' [+] Shannon Entropy: ' + color.GREEN
                        + '%s' % (calcEntropy(tokenx1)))
                verbout(color.CYAN, ' [+] Second Token: ' + color.BLUE + tokenx2)
                verbout(color.ORANGE, ' [+] Shannon Entropy: ' + color.GREEN
                        + '%s' % (calcEntropy(tokenx2)))
                # Calculate the edit distance via the Damerau-Levenshtein algorithm
                m = stringdist.rdlevenshtein(tokenx1, tokenx2)
                verbout(color.CYAN, ' [+] Edit Distance Calculated: '
                        + color.GREY + str(m) + '%')
                # Now it's time to compute the alignment ratio
                n = stringdist.rdlevenshtein_norm(tokenx1, tokenx2)
                verbout(color.CYAN, ' [+] Alignment Ratio Calculated: '
                        + color.GREY + str(n))
                # If both tokens have the same length
                if len(tokenx1) == len(tokenx2):
                    verbout(C, 'Token length calculated is same: ' + color.ORANGE
                            + 'Each %s bytes' % len(byteString(tokenx1)))
                else:
                    verbout(C, 'Token length calculated is different: ' + color.ORANGE
                            + 'By %s bytes' % (len(byteString(tokenx1))
                                               - len(byteString(tokenx2))))
                time.sleep(0.5)
                # In my experience with web security assessments, the anti-CSRF
                # token is often composed of two parts: one remains static while
                # the other is dynamic.
                #
                # For example, if the anti-CSRF token is '837456mzy29jkd911139'
                # for one request and '837456mzy29jkd337221' for another, the
                # '837456mzy29jkd' part of the token remains the same in both
                # requests.
                #
                # The main idea behind this is to detect the static and dynamic
                # parts via the DL algorithm, as discussed above, by calculating
                # the edit distance.
                p = sameSequence(tokenx1, tokenx2)
                tokenx01 = tokenx1.replace(p, '')
                tokenx02 = tokenx2.replace(p, '')
                if n == 0.5 or m == len(tokenx1) / 2:
                    verbout(GR, 'The tokens are composed of 2 parts (one static and the other dynamic)...')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    # A short dynamic part (6 characters or fewer) is considered weak
                    if len(tokenx1) / 2 <= 6:
                        verbout(color.RED, ' [-] Post-Analysis reveals that token might be '
                                + color.BR + ' VULNERABLE ' + color.END + '!')
                        print(color.RED + ' [+] Possible CSRF Vulnerability Detected!')
                        print(color.ORANGE + ' [!] Vulnerability Type: ' + color.BR
                              + ' Weak Dynamic Part of Tokens ' + color.END)
                        print(color.GREY + ' [+] Tokens can easily be ' + color.RED
                              + 'Forged by Bruteforcing/Guessing' + color.END + '!\n')
                        VulnLogger('Analysis',
                                   'Tokens can easily be Forged by Bruteforcing/Guessing.',
                                   '[i] Token 1: ' + tokenx1 + '\n[i] Token 2: ' + tokenx2)
                elif n < 0.5 or m < len(tokenx1) / 2:
                    verbout(R, 'Token distance calculated is ' + color.RED + 'less than 0.5!')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    verbout(color.RED, ' [-] Post-Analysis reveals that token might be '
                            + color.BR + ' VULNERABLE ' + color.END + '!')
                    print(color.GREEN + ' [+] Possible CSRF Vulnerability Detected!')
                    print(color.ORANGE + ' [!] Vulnerability Type: ' + color.BR
                          + ' Weak Dynamic Part of Tokens ' + color.END)
                    print(color.GREY + ' [+] Tokens can easily be ' + color.RED
                          + 'Forged by Bruteforcing/Guessing' + color.END + '!\n')
                    VulnLogger('Analysis',
                               'Tokens can easily be Forged by Bruteforcing/Guessing.',
                               '[i] Token 1: ' + tokenx1 + '\n[i] Token 2: ' + tokenx2)
                else:
                    verbout(R, 'Token distance calculated is ' + color.GREEN + 'greater than 0.5!')
                    verbout(C, 'Static Part : ' + color.GREY + p + color.END
                            + ' | Length: ' + color.CYAN + str(len(p)))
                    verbout(O, 'Dynamic Part of Token 0x1: ' + color.GREY + tokenx01
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx01)))
                    verbout(O, 'Dynamic Part of Token 0x2: ' + color.GREY + tokenx02
                            + color.END + ' | Length: ' + color.CYAN + str(len(tokenx02)))
                    verbout(color.GREEN, ' [+] Post-Analysis reveals that tokens are '
                            + color.BG + ' NOT VULNERABLE ' + color.END + '!')
                    print(color.ORANGE + ' [!] Vulnerability Mitigation: ' + color.BG
                          + ' Strong Dynamic Part of Tokens ' + color.END)
                    print(color.GREY + ' [+] Tokens ' + color.GREEN
                          + 'Cannot be Forged by Bruteforcing/Guessing' + color.END + '!\n')
                    NovulLogger('Analysis', 'Tokens cannot be Forged by Bruteforcing/Guessing.')
                time.sleep(1)
            except KeyboardInterrupt:
                continue
        print(C + 'Post-Scan Analysis Completed!')

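# Sketch: the comments in Analysis() describe tokens that share a static
# prefix and differ only in a dynamic tail; the raw and normalized restricted
# Damerau-Levenshtein distances make that split visible. Minimal illustration
# assuming the `stringdist` package, with os.path.commonprefix used as a
# stand-in for the tool's sameSequence() helper.
import os.path

from stringdist import rdlevenshtein, rdlevenshtein_norm

# The two example tokens from the comment above: the first 14 characters
# are static, the last 6 are dynamic.
token1 = '837456mzy29jkd911139'
token2 = '837456mzy29jkd337221'

static = os.path.commonprefix([token1, token2])  # '837456mzy29jkd'
dynamic1 = token1[len(static):]                  # '911139'
dynamic2 = token2[len(static):]                  # '337221'

m = rdlevenshtein(token1, token2)       # 6: only the dynamic tail differs
n = rdlevenshtein_norm(token1, token2)  # 0.3: distance divided by the longer length

print(static, dynamic1, dynamic2, m, n)
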
def label(exp, termset):
    if not exp:
        return None
    # If there's an exact match, we're done
    if exp in termset:
        return (exp, exp, "exact_full")
    #### OK, so, no exact matches; we're going to look for misspellings,
    #### subwords, and exact matches within the larger string.
    # First, cut down potential matches to those that are shorter or almost
    # shorter than exp (within edit distance range)
    exp_len = len(exp)
    terms = [t for t in termset if len(t) <= exp_len + 2]
    #### First look for almost exact matches based on edit distance:
    # compute the edit distance for every term and take all terms that have
    # an edit distance of 2 or less
    term_to_edit_dist = [(t, rdlevenshtein(t, exp)) for t in terms]
    term_to_edit_dist = [x for x in term_to_edit_dist if x[1] < 3]
    # Sort by edit distance, then length
    term_to_edit_dist = sorted(term_to_edit_dist, key=cmp_to_key(my_cmp))
    if len(term_to_edit_dist):
        for t, dist in term_to_edit_dist:
            if passes_edit_dist_ratio(dist, t, exp):
                # Take the best match -> longest and lowest edit distance
                best_match = term_to_edit_dist[0]
                return (exp, best_match[0], "editdist")
    # OK, nothing obvious, so we need to search through the string.
    #### First, look for exact matches; these can be either subwords or full matches
    exact_matches = [t for t in terms if t in exp]
    # If there are any exact matches, return them
    if len(exact_matches):
        # Order by length and take the longest
        exact_matches = sorted(exact_matches, key=lambda x: -len(x))
        matched_term = exact_matches[0]
        return determine_subword(matched_term, matched_term, exp)
    ### OK, otherwise we're in a situation where we have a fuzzy match to a
    ### subset of the string: find all terms that match somewhere within the
    ### string with an edit distance of < 3
    fuzzy_matches = []
    for t in terms:
        # Allow a bigger edit distance here, then re-check with rdlevenshtein
        fuzzy_match = find_near_matches(t, exp, max_l_dist=2)
        if fuzzy_match:
            matched_text = exp[fuzzy_match[0].start:fuzzy_match[0].end]
            if passes_edit_dist_ratio(fuzzy_match[0].dist, matched_text, t):
                fuzzy_matches.append((matched_text, t))
    if len(fuzzy_matches):
        # Sort the fuzzy matches by length of the matched substring and return
        fuzzy_matches = sorted(fuzzy_matches, key=lambda x: -len(x[0]))
        return determine_subword(fuzzy_matches[0][0], fuzzy_matches[0][1], exp, "fuzzy_")
    # OK, otherwise, no matches!
    return None

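# Sketch: the subword branch of label() combines an approximate substring
# search (find_near_matches, which appears to come from the `fuzzysearch`
# package) with an edit-distance check on the matched span. Minimal
# illustration with a made-up term and sentence.
from fuzzysearch import find_near_matches
from stringdist import rdlevenshtein

term = 'levenshtein'
text = 'we rank the candidates by levenstein distance'

matches = find_near_matches(term, text, max_l_dist=2)
if matches:
    span = text[matches[0].start:matches[0].end]
    # Re-check the matched span with the restricted Damerau-Levenshtein distance
    print(span, matches[0].dist, rdlevenshtein(span, term))
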
def main(argv):
    topic = argv[0]
    filelang = argv[1]
    mainlang = argv[2]
    path = "/home/oyku/embeddings/fasttext/wiki." + filelang + ".align.vec"
    dictionary = load_vec(path)
    mono_path = "/home/oyku/monolingual_fasttext/cc." + filelang + ".300"
    mono_wv = fText.load_fasttext_format(mono_path)
    file = "/home/oyku/myversion/oov_words/" + mainlang + "/" + topic + "_" + filelang + ".txt"
    f = open(file, 'r', encoding='utf8')
    content = f.readlines()
    cont = set()
    for el in content:
        if not el.strip().isdigit():
            cont.add(el.strip())
    print("The number of OOVs: " + str(len(content)))
    print("The number of word OOVs: " + str(len(cont)))
    ## Morphologic
    morphs = {}
    for blob in cont:
        if not blob.isdigit():
            text = Text(blob)
            text.language = filelang
            morphemes = []
            for morp in text.morphemes:
                if len(morp) > 3 and morp in dictionary:
                    morphemes.append(morp)
            if len(morphemes) != 0:
                morphs[blob] = morphemes
    print("Morphologic check is over")
    left = cont.difference(morphs)
    ## Spelling
    spellex = {}
    for oov in left:
        if len(oov) > 2:
            possibles = []
            for inv in dictionary:
                if stringdist.rdlevenshtein(oov, inv) == 1:
                    possibles.append(inv)
            if len(possibles) == 1:
                spellex[oov] = possibles
    print("Spelling check is over")
    next_left = left.difference(spellex)
    ## Fasttext nearest neighbours
    fasttext_bin = {}
    for oov in next_left:
        try:
            similars = mono_wv.wv.most_similar(oov.strip())
            most_sim = ""
            for sim in similars:
                if sim[0] in dictionary and sim[1] > 0.5:
                    most_sim = sim[0]
                    break
            if most_sim != "":
                fasttext_bin[oov.strip()] = [most_sim]
        except:
            continue
    print("Fasttext check is over")
    print("-----------------------------------------------")
    print("Identified with morphologic analysis: " + str(len(morphs)))
    print("Identified with spell analysis: " + str(len(spellex)))
    print("Identified with Fasttext: " + str(len(fasttext_bin)))
    union = union3(morphs, spellex, fasttext_bin)
    print("Total: " + str(len(union)))
    saved_path = "/home/oyku/myversion/oov_matches/" + mainlang + "/" + topic + "_" + filelang + ".p"
    pickle.dump(union, open(saved_path, "wb"))

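# Sketch: the spelling step above keeps an OOV word only when exactly one
# in-vocabulary word sits at restricted Damerau-Levenshtein distance 1.
# Self-contained illustration with a toy vocabulary (the real `dictionary`
# holds aligned fastText vectors).
from stringdist import rdlevenshtein

vocabulary = {'apple', 'apply', 'maple', 'grape'}  # toy stand-in


def unique_distance_one_match(oov, vocab):
    # Collect every in-vocabulary word exactly one edit away from the OOV token
    possibles = [w for w in vocab if rdlevenshtein(oov, w) == 1]
    # Keep the correction only when it is unambiguous
    return possibles[0] if len(possibles) == 1 else None


print(unique_distance_one_match('aple', vocabulary))    # None: 'apple' and 'maple' both match
print(unique_distance_one_match('grapes', vocabulary))  # 'grape'
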
def test_lev(self):
    self.assertEqual(sd.rdlevenshtein('abcd!', 'abcde'), 1)