def levenshtein_method(cdr3, cutoff=1):
    """Collect every pair of entries whose edit distance is within ``cutoff``.

    Each unordered pair drawn from ``cdr3`` is compared with ``lev``.

    Args:
        cdr3: Iterable of sequences to compare pairwise (presumably CDR3
            strings, going by the name -- confirm against the caller).
        cutoff: Largest distance for which a pair is still reported.

    Returns:
        A set of tab-separated ``first<TAB>second<TAB>distance`` strings,
        one per pair that passed the cutoff.
    """
    edges = set()
    for left, right in combinations(cdr3, 2):
        distance = lev(left, right)
        if distance > cutoff:
            continue
        edges.add("\t".join((left, right, str(distance))))
    return edges
def have_same_root(word1, word2):
    """Heuristically decide whether two words share the same root.

    Both words are stemmed with ``stemmer.stemWord``. The stems are treated
    as having the same root when any of the following holds:

    * the stems are equal, or their ``lev`` distance is at most 2;
    * ``common_letters`` reports more than 6 for the two stems;
    * ``common_letters`` reports 4 to 6 and the longer stem is at most
      2 characters longer than that count.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        True if the words are judged to share a root, otherwise False.
    """
    w1_stem = stemmer.stemWord(word1)
    w2_stem = stemmer.stemWord(word2)
    if w1_stem == w2_stem or lev(w1_stem, w2_stem) <= 2:
        return True

    shared = common_letters(w1_stem, w2_stem)
    if shared < 4:
        return False
    if shared > 6:
        return True

    # Use the length of the *longer* stem. The previous
    # ``len(max(w1_stem, w2_stem))`` picked the lexicographically greater
    # string, which is not necessarily the longer one.
    longest = max(len(w1_stem), len(w2_stem))
    # ``shared`` is between 4 and 6 here, so a separate ``diff > shared``
    # check would be implied by ``diff > 2``.
    return abs(longest - shared) <= 2
def get_similar_code(processed_code, language, level):
    """Look up the stored snippet that is closest to ``processed_code``.

    Reads ``coursedata/similar-code-files/<language>/level<level>.csv`` and
    compares ``processed_code`` against the second column of every row using
    ``lev``. The first column of the closest row is returned.

    Args:
        processed_code: The code to compare against the stored rows.
        language: Name of the sub-directory the CSV file is read from.
        level: Level number used to build the file name.

    Returns:
        The first-column value of the best matching row, or ``None`` when no
        row is closer than the similarity threshold, or when the file cannot
        be read or processed.
    """
    filename = "coursedata/similar-code-files/" + language + "/level" + str(level) + ".csv"
    # Similarity threshold: a row at this distance or further is not a match.
    shortest_distance = 75
    similar_code = None
    try:
        with open(filename, mode='r', encoding='utf-8') as file:
            for row in csv.reader(file, quoting=csv.QUOTE_MINIMAL):
                distance = lev(processed_code, row[1])
                if distance < 1:
                    # Identical code: nothing can beat this, stop searching.
                    return row[0]
                if distance < shortest_distance:
                    shortest_distance = distance
                    similar_code = row[0]
    except Exception:
        # Best-effort lookup: any failure (missing file, malformed row, ...)
        # means "no similar code". Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    return similar_code
def getCloserMatch(word, wordlist):
    """Find the wordlist entries closest to ``word`` by Levenshtein distance.

    Args:
        word: The word to search for.
        wordlist: Path to a text file with one candidate per line.

    Returns:
        A tuple ``(matches, min_distance)``. ``matches`` lists every entry
        that has the smallest distance to ``word``, in file order, and
        ``min_distance`` is that distance. For an empty wordlist the result
        is ``([], math.inf)``.

    Raises:
        SystemExit: With code 1, after printing a message, when the wordlist
            cannot be opened or read.

    Example (illustrative, depends on the wordlist contents)::

        getCloserMatch("azerto", "wordlists/french_top1000.txt")
        # -> (['azerty', 'azert'], 1)
    """
    try:
        # The context manager closes the file even if reading fails.
        with open(wordlist, 'r') as f:
            passwords = f.read().splitlines()
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: undecodable content
        # or an invalid path string.
        print("Could not find required file")
        # ``exit()`` is injected by the ``site`` module and is not always
        # available; raising SystemExit directly has the same effect.
        raise SystemExit(1)

    min_distance = math.inf
    matches = []
    for password in passwords:
        distance = lev(word, password)
        if distance < min_distance:
            # Strictly better candidate: restart the list of matches.
            min_distance = distance
            matches = [password]
        elif distance == min_distance:
            matches.append(password)
    return matches, min_distance
clean.append(poljeOrig[x - 1]) #stavljanje u string i ciscenje sil, uzdah, buka i greska segmenata stringGlasova = "" for x in range(len(clean)): if str(clean[x]) != "sil" and str(clean[x]) != "uzdah" and str( clean[x]) != "buka" and str(clean[x]) != "greska": stringGlasova = stringGlasova + clean[x] #ispis prepoznatog i originalnog stringa glasova print "\nISPIS ORIGINAL I PREPOZNATO" print stringLab print stringGlasova #racunanje preciznosti s levenshtein distancom edit_dist = lev(stringLab, stringGlasova) print "\nPRECIZNOST" if odabirDistance == "M": print "KORISTECI MAHALANOBISOVU UDALJENOST" elif odabirDistance == "E": print "KORISTECI EUKLIDOVU UDALJENOST" print str(100 - float(edit_dist) / len(labData) * 100) + "%" #formatiranje i zapisivanje u out.lab start = 0 distribution = len(mfcc_feat) * 100000 / len(clean) text_file = open("out/" + audioSignal.split(".")[0] + "Out.lab", "w") text_file.write("") text_file.close() for x in range(len(clean)): text_file = open("out/" + audioSignal.split(".")[0] + "Out.lab", "a")
#!/usr/bin/env python
"""Report the average edit distance between paired lines of ``out.txt``.

Lines starting with ``--`` are skipped. For every other line the first two
characters are dropped and the rest is stripped. The remaining lines are
taken two at a time as (transcribed, gold) pairs.
"""
from Levenshtein import distance as lev


def pairwise(iterable):
    """Return an iterator over non-overlapping pairs: (s0, s1), (s2, s3), ...

    A trailing element without a partner is dropped.
    """
    a = iter(iterable)
    # The same iterator is passed twice, so each zip step consumes two items.
    return zip(a, a)


errors = []
with open("out.txt", "r") as _file:
    for line in _file:
        if line[:2] != "--":
            errors.append(line[2:].strip())

pairs = list(pairwise(errors))
_sum = sum(lev(transcribed, gold) for transcribed, gold in pairs)

# Count the pairs that were actually compared. Dividing by ``len(errors) / 2``
# raised ZeroDivisionError on empty input and disagreed with the summed pairs
# when the number of lines was odd.
pair_count = len(pairs)
average = _sum / pair_count if pair_count else 0.0

msg = f"""Out of {pair_count} errors, the average edit distance was {average}"""
print(msg)
def levenshtein(x, y, n=None):
    """Return the ``lev`` distance of ``x`` and ``y`` divided by the longer length.

    Args:
        x: First sequence.
        y: Second sequence.
        n: If not ``None``, both inputs are cut to their first ``n`` items
            before they are compared.

    Returns:
        ``lev(x, y) / max(len(x), len(y))``; when both inputs are empty the
        divisor is 1 instead of 0.
    """
    if n is not None:
        x, y = x[:n], y[:n]
    distance = lev(x, y)
    longest = max(len(x), len(y))
    # ``longest`` is 0 only when both inputs are empty; fall back to 1.
    return distance / (longest or 1)
# Pixel layout used by the chat scan. The values are the ones the original
# code used inline; its comments state they apply to a 1920 x 1080 capture.
_CHAT_BOTTOM_Y = 750       # scanning stops here (chat input box, per the original comment)
_OTHERS_BOX_X = 288        # column probed for other players' message boxes
_PLAYER_BOX_X = 1375       # column probed for the local player's message boxes
_OTHERS_COLOR_DX = 100     # colour sample column = box column + this offset
_PLAYER_COLOR_DX = -40
_TEXT_X0 = 420             # horizontal extent of the name / message crops
_TEXT_X1 = 1253
_NAME_ROWS = 50            # rows holding the sender name; also the colour sample row offset
_BOX_PROBE_DY = 100        # second probe pixel, this many rows below the first
_BOX_GAP = 10              # rows skipped when entering and after leaving a box


def _scan_chat_column(closing, frame, frame_count, box_x, color_dx):
    """Scan one pixel column for message boxes and log messages not seen before.

    Args:
        closing: Binarised frame (white == 255) used for box detection and OCR.
        frame: Original BGR frame, used to sample the sender's colour.
        frame_count: Frame number stored on each new ``ChatLog`` entry.
        box_x: Column probed for white message-box pixels.
        color_dx: Offset from ``box_x`` to the column where the colour is sampled.
    """
    image_height = closing.shape[0]
    y = 0
    while y < _CHAT_BOTTOM_Y:
        y += 1

        # A message box is assumed when this pixel and the one
        # _BOX_PROBE_DY rows below it are both white.
        if not (closing[y, box_x] == 255 and closing[y + _BOX_PROBE_DY, box_x] == 255):
            continue

        # Walk down the white run to find the bottom edge of the box. The
        # height check keeps the walk from indexing past the last image row.
        end_y = y + _BOX_GAP
        while end_y < image_height and closing[end_y, box_x] == 255:
            end_y += 1

        text_crop = closing[y + _NAME_ROWS:end_y - 2, _TEXT_X0:_TEXT_X1]
        if text_crop.size == 0:
            print("failure in grabbing text from chat")
            y = end_y + _BOX_GAP
            continue

        message = get_text_from_image(text_crop)

        # Sample the sender's colour from the original (BGR) frame.
        b, g, r = frame[y + _NAME_ROWS, box_x + color_dx]
        color_name = utils.get_color_name(r, g, b)

        # A message counts as already logged when the same colour has an
        # entry with identical text, or with text at edit distance 1 (treated
        # as an OCR misread of a single character).
        already_logged = any(
            chat.colorName == color_name
            and (chat.message == message or lev(chat.message, message) == 1)
            for chat in ChatLogArray
        )

        if not already_logged:
            # Only OCR the sender name when a new entry is actually created.
            name_crop = closing[y:y + _NAME_ROWS, _TEXT_X0:_TEXT_X1]
            sender_name = get_text_from_image(name_crop)
            ChatLogArray.append(
                ChatLog(len(ChatLogArray), r, g, b, color_name, frame_count,
                        sender_name, message)
            )

        # NOTE(review): the original whitespace was lost; this jump is taken
        # to run for every detected box, logged or not -- confirm.
        y = end_y + _BOX_GAP


def extract_text_chat_screen(frame, frame_count=0, include_player_messages=True):
    """Extract chat messages from a frame and append new ones to ``ChatLogArray``.

    The frame is converted to grayscale, thresholded at 245 and morphologically
    closed with a 3x3 kernel. The resulting binary image is scanned for
    message boxes, first in the column used for other players and then,
    optionally, in the column used for the local player. Each message that is
    not already logged for the same colour is appended to the module-level
    ``ChatLogArray`` as a ``ChatLog``.

    Args:
        frame: BGR image of the chat screen. The pixel coordinates assume a
            1920 x 1080 capture.
        frame_count: Frame number recorded on each new log entry.
        include_player_messages: Whether messages from the person playing the
            game are logged as well.

    Returns:
        None. Results are appended to ``ChatLogArray``.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 245, 255, cv2.THRESH_BINARY)
    kernel = np.ones((3, 3), np.uint8)
    closing = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    _scan_chat_column(closing, frame, frame_count, _OTHERS_BOX_X, _OTHERS_COLOR_DX)
    if include_player_messages:
        _scan_chat_column(closing, frame, frame_count, _PLAYER_BOX_X, _PLAYER_COLOR_DX)