Example #1
    def getTreffer(self):
        # returns a list of ids
        #print "get Treffer"
        
        daten = self.Datenbank.getDataAsList("select deutsch, fremd from vokabeln where id like "+ str(self.ids))
        
        #print "comparison between "+str(daten[0][1]) +" and "+str(self.wort)
        
        if self.richtung == 1:
            if leve.distance(daten[0][1], self.wort) <= int(self.distanz) and leve.jaro(daten[0][1], self.wort) > round((self.minTreffer/100), 2):
                self.direktTreffer = True
                #print self.ids
                return [self.ids]
        else:
            if leve.distance(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) <= int(self.distanz) \
                    and leve.jaro(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) > round((self.minTreffer/100), 2):
                #print "Levenshtein comparison between "+ str(daten[0][0])+ " and "+ str(self.wort)
                self.direktTreffer = True
                #print self.id
                return [self.ids]

       
        rueckgabe = []
        for i in self.liste:
            #print "Aktueller vergleich "+unicode(i[0]) +" und "+unicode(self.wort)
            if leve.distance(i[0], self.wort) <= int(self.distanz) and leve.jaro(i[0], self.wort) > 0.7:
                rueckgabe.append(i[1])
        return rueckgabe
Example #2
                def find_by_distance():
                    word_nl = getNormalWord(word).lower()
                    min_dist = 10
                    min_dist_word_idx = wordIdx
                    import Levenshtein
                    for i in range(0, self.mTotalEntries):
                        word_ = getNormalWord(self.getWord(i)).lower()
                        dist = Levenshtein.distance(word_nl, word_)
                        if dist == 1:
                            return i
                        if dist < min_dist:
                            min_dist = dist
                            min_dist_word_idx = i
                    for i in range(0, self.derived_dict.mTotalEntries):
                        word_ = getNormalWord(self.derived_dict.getWord(i)).lower()
                        dist = Levenshtein.distance(word_nl, word_)
                        if dist == 1:
                            words = self.derived_dict.getExplanations(word_)
                            return self.getWordIdxInternal(words[0])
                        if dist < min_dist:
                            min_dist = dist
                            words = self.derived_dict.getExplanations(word_)
                            min_dist_word_idx = self.getWordIdxInternal(words[0])

                    return min_dist_word_idx
Example #3
    def _match_user_agent(cls, user_agent):
        device = cls.objects.filter(user_agent=user_agent).order_by("-actual_device_root")[:1]

        if len(device):
            return device[0]
        else:
            if settings.UA_PREFIX_MATCHING:
                # Try more flexible matching, using the first third of the UA string
                ds_user_agent = user_agent[: len(user_agent) // 3]
                devices = cls.objects.filter(user_agent__startswith=ds_user_agent)
                devices = devices.order_by("-actual_device_root")[: settings.UA_PREFIX_MATCHING_LIMIT]

                if len(devices):
                    user_agent = force_unicode(user_agent)
                    best = reduce(
                        lambda x, y: Levenshtein.distance(user_agent, x.user_agent)
                        < Levenshtein.distance(user_agent, y.user_agent)
                        and x
                        or y,
                        devices,
                    )

                    if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                        return best

            if settings.UA_GENERIC_FALLBACK:
                # Try to match with generic properties
                # :TODO:
                raise NotImplementedError

        raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
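The reduce() call above (and the identical one in Example #4 below) relies on the old Python 2 "cond and x or y" ternary idiom: it keeps whichever candidate device has the smaller Levenshtein distance to the incoming user agent. A minimal sketch of the same selection written with min() and a key function, assuming the python-Levenshtein package and simple stand-in objects with a user_agent attribute:

import Levenshtein

class Device(object):
    # stand-in for the ORM objects queried above
    def __init__(self, user_agent):
        self.user_agent = user_agent

def closest_device(user_agent, devices):
    # pick the candidate whose stored UA has the smallest edit distance to the incoming one
    return min(devices, key=lambda d: Levenshtein.distance(user_agent, d.user_agent))

candidates = [Device("Mozilla/5.0 (Linux; Android 9) Mobile"),
              Device("Mozilla/5.0 (Linux; Android 10) Mobile")]
print(closest_device("Mozilla/5.0 (Linux; Android 10; Pixel 3) Mobile", candidates).user_agent)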
Example #4
    def _match_user_agent(cls, user_agent):
        device = cls.objects.filter(user_agent=user_agent).order_by('-actual_device_root')[:1]

        if len(device):
            return device[0]
        else:
            if settings.UA_PREFIX_MATCHING:
                #~ Try more flexible matching, from 1/3rd to 1/10th of the original UA string
                #~ We break out as soon as we get a match (or matches, in which case we use Levenshtein 
                #~ distance to determine which one we want to use) or if the shortened UA string is less
                #~ than 5 characters long
                devices = None
                for factor in range(3,10):
                    if len(user_agent)/factor <= 5: break
                    devices = cls._match_partial_user_agent(user_agent,factor)
                    if len(devices): break
                
                if len(devices):
                    user_agent = force_unicode(user_agent)
                    best = reduce(
                        lambda x,y: Levenshtein.distance(user_agent, x.user_agent) < Levenshtein.distance(user_agent, y.user_agent) and x or y,
                        devices,
                    )

                    if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                        return best
            
            if settings.UA_GENERIC_FALLBACK:
                # Try to match with generic properties
                # :TODO:
                raise NotImplementedError, 'Generic properties matching is not implemented'

        raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
    def get_closest_match(self,
                          cells,
                          matching_threshold,
                          suppress_non_answer_cells=False):
        """ Returns a list of cells that most closely match
            the question prompt.  If no match is better than
            the matching_threshold, the empty list will be
            returned. """
        return_value = []
        distances = [Levenshtein.distance(self.start_md, u''.join(cell['source']))
                     for cell in cells]
        if min(distances) > matching_threshold:
            return return_value

        best_match = argmin(distances)
        if self.stop_md == u"next_cell":
            end_offset = 2
        elif len(self.stop_md) == 0:
            end_offset = len(cells) - best_match
        else:
            distances = [Levenshtein.distance(self.stop_md, u''.join(cell['source']))
                         for cell in cells[best_match:]]
            if min(distances) > matching_threshold:
                return return_value
            end_offset = argmin(distances)
        if len(self.question_heading) != 0 and not suppress_non_answer_cells:
            return_value.append(NotebookExtractor.markdown_heading_cell(self.question_heading, 2))
        if not suppress_non_answer_cells:
            return_value.append(cells[best_match])
        return_value.extend(cells[best_match + 1:best_match + end_offset])
        return return_value
Example #6
def write_lex_stats(b, num, syll = None):
    """Use Levenshtein package to calculate lev and count up mps, neighbors, etc"""
    total = 0.
    mps = 0
    neighbors = 0
    homophones = 0
    lev_total = 0
    for item in itertools.combinations(b, 2):
        if syll != None:
            #if len(item[0].split("-"))==syll or len(item[1].split("-"))==syll:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0: homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])): mps += 1
            total += 1
            lev_total += lev
        else:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0: homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])): mps += 1
            total += 1
            lev_total += lev
    print str(num)
    f.write(",".join([str(x) for x in [num, homophones, mps, neighbors, lev_total/total, len(b)] ]) + "\n")
    return
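In write_lex_stats above, a word pair counts as homophones at distance 0 and as neighbors at distance 1, and a distance-1 pair of equal length additionally counts toward mps (presumably minimal pairs, i.e. neighbors that differ by a single substitution). A minimal self-contained sketch of the same counting over a toy syllabified list, assuming the python-Levenshtein package (hyphens mark syllable boundaries, as in the function above):

import itertools
import re

import Levenshtein

words = ["kaet", "baet", "kae-tl"]   # toy forms; "-" marks syllable boundaries
homophones = neighbors = mps = 0
for a, b in itertools.combinations(words, 2):
    a_, b_ = re.sub("-", "", a), re.sub("-", "", b)
    lev = Levenshtein.distance(a_, b_)
    if lev == 0:
        homophones += 1
    elif lev == 1:
        neighbors += 1
        if len(a_) == len(b_):
            mps += 1                 # same length and one edit apart: a single substitution
print(homophones, neighbors, mps)    # homophones=0, neighbors=2, mps=1 for this toy list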
Example #7
def strip_bogus_lines(arg_lines):
    re_blank = re.compile(r'^\s$')
    re_page_left = re.compile(r'^\s*Page [0-9]+')
    re_page_right = re.compile(r'Page [0-9]+\s*$')
    berk_string = 'BERKELEY TRAINING ASSOCIATES © 2009\n'
    mft_string = 'MFT PRACTICE EXAMINATIONS'

    lines = []
    for line in arg_lines:
        bogosity = 0.0
        if re_blank.search(line):
            bogosity += 1.0
        if re_page_left.search(line):
            bogosity += 0.5
        if re_page_right.search(line):
            bogosity += 0.5
        l = Levenshtein.distance(line[-(len(mft_string)):], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line[:len(mft_string)], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line, berk_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        if bogosity < 0.25:
            lines.append(line)
    return lines
Example #8
    def compare_list(self, company_name, table_name):
        iac_hz_list = [
            [0, 1, 2, 3, 4, 5, 6]
        ]
        iac_guanwang_list = [
            [1, 1, 2, 3, 4, 5, 6]
        ]
        hz_list = [x.replace("\n", "").replace("\r", "") for x in iac_hz_list]
        guanwang_list = [x.replace("\n", "").replace("\r", "") for x in iac_guanwang_list]

        if iac_hz_list is None:
            print "haizhi system lack %s's %s." % (company_name, table_name)
            return
        if iac_guanwang_list is None:
            print "guanwang lack %s's %s." % (company_name, table_name)
            return

        if len(iac_hz_list) == len(iac_guanwang_list):
            print "the length of list %s is the same: %d" % (table_name, len(iac_hz_list))
        else:
            print "the length of list %s is different: %d, %d" % (table_name, len(iac_hz_list), len(iac_guanwang_list))

        # number of identical elements
        num = 0
        for row in hz_list:
            if row in guanwang_list:
                # guanwang_list.remove(row)
                # hz_list.remove(row)
                num += 1
                continue
            else:
                for row2 in guanwang_list:
                    Levenshtein.distance(row, row2)
Example #9
def byLevenshtein(key, result_yield):
    lang = "zha"
    try:
        str(key).encode('iso-8859-1')
    except UnicodeEncodeError:
        lang = "zh"
    result_list2d = []
    if lang == "zha":
        for i in result_yield:
            result_list2d.append([Levenshtein.distance(key, i[0]), i])
    else:
        for i in result_yield:
            for j in i[1]:
                list_tmp = split(r"[\[\]\(\)\ \;\,\。\,\.]", j)
                list_distance = []
                if len(list_tmp) == 2:  # 2 means this entry contains only 1 word
                    distance = Levenshtein.distance(key, list_tmp[1])
                    if distance == 0:
                        list_distance.append(-1)  # -1 means the best matched one
                    else:
                        list_distance.append(distance)
                        continue
                for tmp in list_tmp:
                    if key in tmp:
                        list_distance.append(Levenshtein.distance(key, tmp))
                result_list2d.append([min(list_distance), i])
                # The method above is not so accurate, but it might work better than the previous one
    result_list2d.sort()
    for i in result_list2d:
        yield i[1]
Example #10
def read_type(left_read, right_read, left_enzsite, right_enzsite, left_bc, right_bc):
    """Determine if bisulfite read is watson or crick"""
    lr_enz_left = left_read[1][len(left_bc):len(left_bc)+5]
    rr_enz_right = right_read[1][len(right_bc):len(right_bc)+5]
    if left_enzsite == 'TACAA' and right_enzsite == 'TGCAG':
        return 'crick'
    elif right_enzsite == 'TACAA' and left_enzsite == 'TGCAG':
        return 'watson'
    elif right_enzsite == 'TGCAG' and left_enzsite == 'TGCAG':
        return 'gbs'
    else:
        #enzyme sites have not been established correctly, establish read 
        #type based on closest matching enz site and CG count.
        watson_count = left_read[1].count('G') + right_read[1].count('C') +0.001
        crick_count = left_read[1].count('C') + right_read[1].count('G') +0.001
        left_distance = Levenshtein.distance(lr_enz_left, left_enzsite)
        right_distance = Levenshtein.distance(rr_enz_right, right_enzsite)
        if left_distance < right_distance:
            #left enz_site should be leading since it has fewer mismatches.
            if left_enzsite == 'TACAA' and crick_count/float(watson_count)>2:
                return 'crick'
            else:
                return 'nodet'
        else:
            if left_enzsite == 'TGCAG' and watson_count/float(crick_count)>2:
                return 'watson'
            else:
                return 'nodet'
Example #11
def splitted_word_distance( pattern, text ):
    #remove trailing chars...
    words = text.split()
    lp = len(pattern)
    lw = len(words)
    if lp < lw:
        d = len(''.join(words[lp:]))
        n = lp
    else: 
        if re_type in map( lambda e: type(e), pattern[lw:] ):
            return float("inf")
        d = len(''.join(   pattern[lw:]   ))
        n = lw
    d0 = [ d ]
    for i in range(n):
        word = words[i]
        p = pattern[i]
        if type( p ) == str:
            d += levenshtein.distance( p, word )
            d0.append( levenshtein.distance( p, word ) )
        elif type( p ) == re_type:
            if not p.fullmatch( word ):
                return float("inf")
        else:
            raise Exception( "Pattern has wrong type %s" % (str(type(pattern))) )
    return d
Example #12
def find_similar(word):
  word_sound = fuzzy.nysiis(word)
  best = None 
  best_dist = 99999
  for w in wordlist:
    if pylev.distance(word_sound, w[1]) < best_dist and word != w[0][:-1]:
      best_dist = pylev.distance(word_sound, w[1])
      best = (w[0], best_dist)
  return best    
Example #13
 def testLev(self):
     s1 = "hello"
     s2 = "dog"
     s3 = "frog"
     s4 = "log"
     self.assertEqual(Lev.distance(s1,s2), 5)
     self.assertEqual(Lev.distance(s2,s3), 2)
     self.assertEqual(Lev.distance(s3,s4), 2)
     self.assertEqual(Lev.distance(s1,s1), 0)
Example #14
    def is_tRNA(self, seq):
        """Takes a sequence and determines whether or not it matches the
        criterion for being a tRNA
        """
        length = len(seq)
        sub_size = 24
        t_loop_error = True
        acceptor_error = True
        cur_seq_specs = SeqSpecs()

        # Start the sliding window at the last 24 bases, and move to the left
        # one at a time
        for i in range(length - sub_size + 1):
            sub_str = seq[-(i + sub_size):(length - i)]
            t_loop_seq = sub_str[0:9]
            acceptor_seq = sub_str[-3:]
            t_loop_dist = (lev.distance("GTTC", sub_str[0:4])
                + lev.distance("C", sub_str[8]))
            acceptor_dist = lev.distance("CCA", sub_str[-3:])
            mis_count = t_loop_dist + acceptor_dist

            if t_loop_dist < 1:
                t_loop_error = False
            else:
                t_loop_error = True
            if acceptor_dist < 1:
                acceptor_error = False
            else:
                acceptor_error = True
            if mis_count < cur_seq_specs.mis_count:
                cur_seq_specs.length = length
                cur_seq_specs.mis_count = mis_count
                cur_seq_specs.t_loop_error = t_loop_error
                cur_seq_specs.acceptor_error = acceptor_error
                cur_seq_specs.seq = seq
                cur_seq_specs.seq_sub = sub_str
                cur_seq_specs.t_loop_seq = t_loop_seq
                cur_seq_specs.acceptor_seq = acceptor_seq
            if mis_count < 2:
                cur_seq_specs = self.handle_pass_seq(cur_seq_specs, i)
                res_tup = (True, cur_seq_specs)
                return res_tup

        # Handles a failed sequence
        if cur_seq_specs.t_loop_error and cur_seq_specs.acceptor_error:
            if length < 24:
                self.stats_dict['short_rejected'] += 1
            else:
                self.stats_dict['both_rejected'] += 1
        elif cur_seq_specs.acceptor_error and not cur_seq_specs.t_loop_error:
            self.stats_dict['acceptor_seq_rejected'] += 1
        elif cur_seq_specs.t_loop_error and not cur_seq_specs.acceptor_error:
            self.stats_dict['t_loop_seq_rejected'] += 1
        self.stats_dict['total_rejected'] += 1
        res_tup = (False, cur_seq_specs)

        return res_tup
Example #15
def nameSort( key, character ):
    length = Levenshtein.distance(key, character['name'])
    if character.get('real_name'):
        n = Levenshtein.distance(key, character['real_name'])
        length = n if n < length else length
    if character.get('aliases'):
        for alias in character['aliases'].split():
            n = Levenshtein.distance(key, alias)
            length = n if n < length else length
    return length
 def check_search_terms(self, search_term):
     global search_term_relevance_list        
     distance = 1000
     relevance = 1
     for terms in search_term_relevance_list:
         if Levenshtein.distance(search_term, terms.search_term) < distance:
             distance = Levenshtein.distance(search_term, terms.search_term)
             relevance = terms.relevance
     
     return relevance
def identify_primer_with_mismatches(seq, fw, rev, max_mismatch=8):
    for primer in fw:
        d = Levenshtein.distance(seq[: len(primer)], primer)
        if d < max_mismatch:
            return +1, primer
    for primer in rev:
        d = Levenshtein.distance(seq[: len(primer)], primer)
        if d < max_mismatch:
            return -1, primer
    return None, None
Example #18
	def writeLevenshteinDistance(self, termIndex, dictionary, dictionaryLen, levThr, corpus_writer):

		term, termLen = dictionary[termIndex]
		#termLen = len(term)

		# Compute candidates for levenshtein distance of term from termToRowIndex.
		#candidates = []
		candidates = [None] * dictionaryLen
		i = 0
		for candidateId, (candidate, candidateLen) in dictionary.iteritems():
			#candidate, candidateLen = dictionary[candidateId]
			
			# Add candidate if the difference between termLen and candidateLen is less than or equal to 0.5*maxlen.
			if termLen >= candidateLen:
				diff = termLen - candidateLen
				if diff <= 0.5*termLen:
					candidates[i] = (candidateId, candidate, candidateLen)
					i += 1
			else:
				diff = candidateLen - termLen
				if diff <= 0.5*candidateLen:
					candidates[i] = (candidateId, candidate, candidateLen)
					i += 1
		
		#Grab the sublist excluding the preallocated values
		candidates = candidates[0:i]
		
			#if 2*candidateLen >= termLen:
			#	candidates.append((candidateId, candidate, candidateLen))
			#elif 2*termLen >= candidateLen:
			#	candidates.append((candidateId, candidate, candidateLen))
		#candidates = [(candidateId, dictionary[candidateId]) for candidateId in dictionary.keys() if 2*len(dictionary[candidateId]) >= termLen]
		#candidates += [(candidateId, dictionary[candidateId]) for candidateId in dictionary.keys() if 2*termLen >= len(dictionary[candidateId])]
		#candidates = set(candidates)
		
		# Compute the values
		#sims = []
		sims = [None] * dictionaryLen
		i = 0
		for candidateId, candidate, candidateLen in candidates:
			sim = 0.0
			#Compute distance between candidate and term
			if termLen >= candidateLen: #Split on len to prevent the use of max, which is awfully slow.
				sim = 1.0 - float(Levenshtein.distance(candidate, term))/termLen #Compute Levenshtein distance
			else:
				sim = 1.0 - float(Levenshtein.distance(candidate, term))/candidateLen #Compute Levenshtein distance
			if sim >= levThr:
				sims[i] = (candidateId, sim)
				i += 1
					
		#Grab the sublist excluding the preallocated values
		sims = sims[0:i]
		
		max_id, veclen = corpus_writer.write_vector(termIndex, sims)
		return veclen
def get_edit_dist(input_file, output_file, target_seq):
    fi = open (input_file,'r')
    fo = open (output_file, 'w')
    fo.write("Chr\tLocation\tForward29\tReverse29\tEdit dist for\tEdit dist rev\tStep for\tStep rev\tDeletion# for\tDeletion# rev\tBulge dist for\tBulge dist rev\n")
    for line in fi.xreadlines():
        units = line.split()
        chrm = units[0]
        loc = units[1]
        forseq = units[2]
        revseq = units[3]
        
        value = {}
        for nuc in "ATGC":
            seq = target_seq.replace('N',nuc)  # str.replace returns a new string; the original result was discarded
            value[nuc] = (l.distance(seq,forseq),l.editops(seq,forseq))
        for_max = max(value, key=value.get)
        for_dist, for_editops = value[for_max]
        for_step=[]        
        for each in for_editops:
            a,b,c =each
            for_step.append(a)
        for_deletion=for_step.count('delete')
        value = {}
        for nuc in "ATGC":
            seq = target_seq.replace('N',nuc)  # as above, keep the substituted sequence
            value[nuc] = (l.distance(seq,revseq),l.editops(seq,revseq))
        rev_max = max(value, key=value.get)
        rev_dist, rev_editops = value[rev_max]
        rev_step=[]        
        for each in rev_editops:
            a,b,c=each
            rev_step.append(a)
        rev_deletion=rev_step.count('delete')
        #print int(rev_deletion)
        
        bulge_l = [for_dist+for_deletion*2,rev_dist+rev_deletion*2]
        del_l = [for_deletion,rev_deletion]
        fo.write(
        "%s\t%s\t\
        %s\t%s\t\
        %d\t%d\t\
        %s\t%s\t\
        %d\t%d\t\
        %d\t%d\t\
        %d\t%d\n"%
        (chrm,loc,
         forseq,revseq,
         for_dist,rev_dist,
         for_editops,rev_editops,
         for_deletion,rev_deletion,
         bulge_l[0],bulge_l[1],
         min(bulge_l),del_l[bulge_l.index(min(bulge_l))]))
        #print rev_dist+rev_deletion*4
    fi.close();fo.close()
def get_closest_distances(flu_strings, host_strings):
    """For each flu sequence, find the closet distance to a human string"""

    min_distances = []

    for flu_s in flu_strings:
        dis = Levenshtein.distance(flu_s, 
                                   host_strings[0])
        for host_s in host_strings[1:]:
            d = Levenshtein.distance(flu_s, 
                                     host_s)
            dis = min([dis, d])
        print dis
Example #21
def match_something(item, list):
    item = item.replace(" ","")
    item = item.replace(".", "")
    item = item.replace(",", "")
    lowest = list[0]
    lowestdelta = Levenshtein.distance(item, list[0])
    for entry in list:
        delta = Levenshtein.distance(item, entry)
        if delta < lowestdelta:
            lowestdelta = delta
            lowest = entry

    print(lowestdelta, item, lowest)
    return lowest
Example #22
def soundex_distance(ovv_snd,cand):
    try:
        lev = Levenshtein.distance(unicode(ovv_snd),soundex.soundex(cand.decode("utf-8","ignore")))
    except UnicodeEncodeError:
        print('UnicodeEncodeError[ovv_snd]: %s %s' % (ovv_snd,cand))
        lev = Levenshtein.distance(ovv_snd,soundex.soundex(cand.encode("ascii","ignore")))
    except UnicodeDecodeError:
        print('UnicodeDecodeError[ovv_snd]: %s %s' % (ovv_snd,cand))
        lev = Levenshtein.distance(ovv_snd,soundex.soundex(cand.decode("ascii","ignore")))
    except TypeError:
        print ('TypeError[ovv_snd]: %s %s' % (ovv_snd,cand))
        lev = 10.
    snd_dis = lev
    return snd_dis
def checkDifferentDescriptions(clientinfo, serverinfo):
    try:
        maxDistance = int(sys.argv[1])
    except (IndexError, ValueError):
        maxDistance = 5

    if clientinfo['name'] != serverinfo['name'] and Levenshtein.distance(clientinfo['name'], serverinfo['name']) <= maxDistance:
        output(clientinfo, serverinfo)
    elif clientinfo['name'] != serverinfo['name'] and clientinfo['name'].lower() == serverinfo['name'].lower():
        output(clientinfo, serverinfo)
    elif clientinfo['desc'] != serverinfo['desc'] and Levenshtein.distance(clientinfo['desc'], serverinfo['desc']) <= maxDistance:
        output(clientinfo, serverinfo)
    elif clientinfo['desc'] != serverinfo['desc'] and clientinfo['desc'].lower() == serverinfo['desc'].lower():
        output(clientinfo, serverinfo)
Example #24
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)

    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)

    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))

    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie

            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(), clf.decision_function(temp), clf.predict(temp)))
            f.write(
                "%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n"
                % (
                    dvd.rstrip(),
                    maxMovie.rstrip(),
                    1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                    lev.jaro(dvd, maxMovie),
                    lev.jaro_winkler(dvd, maxMovie),
                    lev.ratio(dvd, maxMovie),
                    clf.decision_function(temp),
                    clf.predict(temp),
                )
            )
Example #25
def filter_hits_by_distance(hits, source_text,
                            min_similarity=DEFAULT_MIN_SIMILARITY):
    """Returns ES `hits` filtered according to their Levenshtein distance
    to the `source_text`.

    Any hits with a similarity value (0..1) lower than `min_similarity` will be
    discarded. It's assumed that `hits` is already sorted from higher to lower
    score.
    """
    if min_similarity <= 0 or min_similarity >= 1:
        min_similarity = DEFAULT_MIN_SIMILARITY

    filtered_hits = []
    for hit in hits:
        hit_source_text = hit['_source']['source']
        distance = Levenshtein.distance(source_text, hit_source_text)
        similarity = (
            1 - distance / float(max(len(source_text), len(hit_source_text)))
        )

        logger.debug(
            'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
            similarity, distance, source_text, hit_source_text
        )

        if similarity < min_similarity:
            break

        filtered_hits.append(hit)

    return filtered_hits
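filter_hits_by_distance() above expects Elasticsearch-style hits whose original text sits under hit['_source']['source'], and it stops at the first hit whose normalized similarity (1 - distance / length of the longer string) falls below the threshold. A minimal usage-style sketch with hand-built hits; the 0.7 threshold is an assumption for this sketch, while the real DEFAULT_MIN_SIMILARITY and logger live in the surrounding module:

import Levenshtein

MIN_SIMILARITY = 0.7                       # assumed threshold for this sketch
source_text = 'Save your changes before closing'
hits = [                                   # hand-built, already sorted from best to worst
    {'_source': {'source': 'Save your changes before closing'}},
    {'_source': {'source': 'Save your change before closing'}},
    {'_source': {'source': 'Completely unrelated text'}},
]

kept = []
for hit in hits:
    candidate = hit['_source']['source']
    distance = Levenshtein.distance(source_text, candidate)
    similarity = 1 - distance / float(max(len(source_text), len(candidate)))
    if similarity < MIN_SIMILARITY:
        break                              # mirrors the early break in the function above
    kept.append(candidate)

print(kept)                                # only the two near-identical strings survive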
 def __searchNbestNodeMatchRestricted__(self, segm_s, segm_best, valid_nodes, is_prefix):
     # TODO:
     segm = " ".join(segm_s).strip()
     n = len(segm)
     # max_lsc = float("-inf")
     # max_node = None
     
     nbest_nodes = []
     for n_idx in valid_nodes:
         covered_sent = self.__nodes__[n_idx].getCoveredString().strip()
         if not is_prefix or (is_prefix and covered_sent[0:3] == "<s>"): 
             covered_sent = covered_sent.replace("|UNK|UNK|UNK","").replace("<s>","").replace("</s>","").strip()
             d = Levenshtein.distance(segm,covered_sent)
             d = min(d,n)
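             # err_lsc below is a log binomial likelihood:
             # log( C(n,d) * err_p**d * (1-err_p)**(n-d) ), the probability of d character errors in n characters under error rate err_p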
             err_lsc = d*log(self.err_p) + (n-d)*log(1.0-self.err_p) + log(fact(n))-(log(fact(d))+log(fact(n-d)))
             itp_lsc = self.__nodes__[n_idx].getInsideLogScore()+self.__nodes__[n_idx].getOutsideLogScore()
             cur_lsc = itp_lsc+self.err_w*err_lsc #inside x outside x err**err_w
             # if cur_lsc > max_lsc:
             #     max_lsc = cur_lsc
             #     max_itp_lsc = itp_lsc
             #     max_node = n_idx
             if len(nbest_nodes)<=segm_best or cur_lsc>nbest_nodes[0][0]:
                 nbest_nodes.append((cur_lsc,itp_lsc,n_idx))
                 nbest_nodes = sorted(nbest_nodes)[-segm_best:]
                 #print nbest_nodes
     #return max_lsc,max_itp_lsc,max_node
     still_more_options = True
     if len(nbest_nodes)<segm_best:
         still_more_options = False
     return nbest_nodes[0],still_more_options
    def clean(self):
        """
        Validates that the old and new passwords are not too similar.
        """
        cleaned_data = super(PasswordPoliciesChangeForm, self).clean()
        old_password = cleaned_data.get("old_password")
        new_password1 = cleaned_data.get("new_password1")

        if old_password and new_password1:
            if old_password == new_password1 and \
                not settings.PASSWORD_USE_HISTORY:
                raise forms.ValidationError(
                        self.error_messages['password_identical'])
            else:
                if settings.PASSWORD_DIFFERENCE_DISTANCE:
                    try:
                        import Levenshtein
                    except ImportError:
                        pass
                    else:
                        distance = Levenshtein.distance(old_password,
                                                        new_password1)
                        if distance < settings.PASSWORD_DIFFERENCE_DISTANCE:
                            raise forms.ValidationError(
                                    self.error_messages['password_similar'])
        return cleaned_data
Example #28
 def __searchBestNodeMatch__(self, pref_s):
     if len(pref_s)==0:
         node=self.__nodes__[self.__init_node__]
         ec_lsc = node.getInsideLogScore()+node.getOutsideLogScore()
         return self.__init_node__,ec_lsc,ec_lsc
     
     pref = " ".join(pref_s).strip()
     n = len(pref)
     max_lsc = float("-inf")
     max_node = None
     ordered_keys = sorted(self.__nodes__)
     #inside x outside x err
     for n_idx in ordered_keys:
         covered_sent = self.__nodes__[n_idx].getCoveredString().strip()
         if covered_sent[0:3] == "<s>": #consider only nodes covering a prefix
             covered_sent = covered_sent.replace("|UNK|UNK|UNK","").replace("<s>","").replace("</s>","").strip()
             d = Levenshtein.distance(pref,covered_sent)
             d = min(d,n)
             err_lsc = d*log(self.err_p) + (n-d)*log(1.0-self.err_p) + log(fact(n))-(log(fact(d))+log(fact(n-d)))
             itp_lsc = self.__nodes__[n_idx].getInsideLogScore()+self.__nodes__[n_idx].getOutsideLogScore()
             cur_lsc = itp_lsc+self.err_w*err_lsc
             if cur_lsc > max_lsc:
                 max_lsc = cur_lsc
                 max_itp_lsc = itp_lsc
                 max_node = n_idx
                 # print "\n-----------------------"
             #     print pref,"#",covered_sent,"#",n,"->",d
             #     print max_lsc, max_itp_lsc, err_lsc
             #     print self.__nodes__[max_node]
             #     print "-----------------------\n"
             # else:
             #     print " -->","#"+covered_sent+"#",d,cur_lsc,itp_lsc,err_lsc
     return max_node,max_lsc,max_itp_lsc
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6


    Note:
        This implementation internally uses python-levenshtein package to compute the Levenshtein distance

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
Example #30
    def norm_levenshtein(str1, str2):
        max_len = float(max([len(str1), len(str2)]))

        try:
            return Levenshtein.distance(str1, str2) / max_len
        except ZeroDivisionError:  # both sections are instrumental
            return 0
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
import json
import os
import csv
import Levenshtein
from args import get_name_similair_args
from tqdm import tqdm
'''
Because English names may contain abbreviations or middle names, e.g. Romit Jitendra Shah vs. Romit J. Shah, or Daniel Durn vs. Daniel J. Durn,
we calculate similarity ratios to decide whether two differently written names refer to the same person.

Four similarity measures are used:
Levenshtein.distance(s1, s2) Compute absolute Levenshtein distance of two strings.
Levenshtein.ratio(s1, s2) The similarity is a number between 0 and 1; it is usually equal to or somewhat higher than difflib.SequenceMatcher.ratio(), because it is based on real minimal edit distance.
Levenshtein.jaro(s1, s2) The Jaro string similarity metric, intended for short strings such as personal names.
Levenshtein.jaro_winkler(s1, s2) A modification of the Jaro metric that gives more weight to a common prefix, as spelling mistakes are more likely to occur near the ends of words.

Output: result csv files are written to the similar-result folder.

Usage:
python name_similar_ratio.py --company='AMAT' --token='yourtoken'
'''
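# A quick illustration of the four measures listed above (assuming the
# python-Levenshtein package imported at the top of this file):
#
#     Levenshtein.distance('Romit Jitendra Shah', 'Romit J. Shah')     # absolute edit distance (int)
#     Levenshtein.ratio('Romit Jitendra Shah', 'Romit J. Shah')        # similarity in [0, 1]
#     Levenshtein.jaro('Daniel Durn', 'Daniel J. Durn')                # Jaro similarity
#     Levenshtein.jaro_winkler('Daniel Durn', 'Daniel J. Durn')        # Jaro-Winkler (prefix-weighted)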


def folder_builder(outputfolder):
    if not os.path.isdir(outputfolder):
        os.mkdir(outputfolder)
Example #32
def eval_evt(etdata_gt, etdata_pr, n_events):

    t = time.time()
    if etdata_gt.evt is None:
        etdata_gt.calc_evt(fast=True)
    if etdata_pr.evt is None:
        etdata_pr.calc_evt(fast=True)

    #levenshtein distance
    evt_gt = etdata_gt.evt['evt']
    evt_gt = evt_gt[~(evt_gt==0)]
    evt_pr = etdata_pr.evt['evt']
    evt_pr = evt_pr[~(evt_pr==0)]
    wer = Lev.distance(''.join(map(str, evt_gt)),
                       ''.join(map(str, evt_pr)))/\
                       float(len(evt_gt))

    _cer = map(lambda _a, _b: Lev.distance(_a, _b),
               ''.join(map(str, etdata_gt.data['evt'])).split('0'),
               ''.join(map(str, etdata_pr.data['evt'])).split('0'))
    mask=etdata_gt.data['evt']==0
    evt_len = float(sum(~mask))
    cer = sum(_cer)/evt_len

    #sample level K
    t = time.time()
    evts_gt_oh = convertToOneHot(etdata_gt.data['evt'], n_events)
    evts_pr_oh = convertToOneHot(etdata_pr.data['evt'], n_events)
    ks = [calc_k(evts_gt_oh[:,i], evts_pr_oh[:,i]) for i in range(1, n_events)]

    evt_gt = etdata_gt.data['evt']
    evt_gt = evt_gt[~(evt_gt==0)]
    evt_pr = etdata_pr.data['evt']
    evt_pr = evt_pr[~(evt_pr==0)]
    ks_all = metrics.cohen_kappa_score(evt_gt, evt_pr)

    ks.extend([ks_all])

    #event level K and F1
    try:
        t = time.time()

        ke_ = []
        f1e_ = []
        for evt in range(1, 4):
            #evt=1
            _etdata_gt = copy.deepcopy(etdata_gt)
            mask_ext = _etdata_gt.data['evt']==0
            mask = _etdata_gt.data['evt']==evt
            _etdata_gt.data['evt'][mask]=1
            _etdata_gt.data['evt'][~mask]=0
            _etdata_gt.data['evt'][mask_ext]=255
            _etdata_gt.calc_evt(fast=True)

            _etdata_pr = copy.deepcopy(etdata_pr)
            mask_ext = _etdata_pr.data['evt']==0
            mask = _etdata_pr.data['evt']==evt
            _etdata_pr.data['evt'][mask]=1
            _etdata_pr.data['evt'][~mask]=0
            _etdata_pr.data['evt'][mask_ext]=255
            _etdata_pr.calc_evt(fast=True)

            evt_overlap, evt_gt, evt_pr = calc_KE(_etdata_gt, _etdata_pr)
            mask = (evt_gt==255) & (evt_pr==255)
            evt_gt = evt_gt[~mask]
            evt_pr = evt_pr[~mask]
            ke_.append(calc_k(evt_gt, evt_pr))
            f1e_.append(calc_f1(evt_gt, evt_pr))


        evt_overlap, evt_gt, evt_pr = calc_KE(etdata_gt, etdata_pr)
        mask = (evt_gt==0) & (evt_pr==0)
        evt_gt = evt_gt[~mask]
        evt_pr = evt_pr[~mask]
        #print ('[overlap], dur %.2f' % (time.time()-t))
        evt_gt_oh = convertToOneHot(evt_gt, n_events)
        evt_pr_oh = convertToOneHot(evt_pr, n_events)
        ke = [calc_k(evt_gt_oh[:,i], evt_pr_oh[:,i]) for i in range(1, n_events)]
        f1e = [calc_f1(evt_gt_oh[:,i], evt_pr_oh[:,i]) for i in range(1, n_events)]

        ke_all = metrics.cohen_kappa_score(evt_gt, evt_pr)
        f1_all = metrics.f1_score(evt_gt, evt_pr, average='weighted')
        ke.extend([ke_all])
        ke_.extend([ke_all])
        f1e.extend([f1_all])
        f1e_.extend([f1_all])
        #print ('[KE], dur %.2f' % (time.time()-t))
    except:
        #TODO: Debug
        print ("Could not calculate event level k")
        ks = [0.,]*(n_events+1)
        ke = [0.,]*(n_events+1)
        f1e = [0.,]*(n_events+1)


    return wer, cer, ke_, ks, f1e_, (evt_overlap, evt_gt, evt_pr)
Example #33
def word_distance(s, t):
    s = s.lower().strip()
    t = t.lower().strip()
    d = Levenshtein.distance(s, t)
    return d
def calcDist(line):
    fileName = line.strip()
    dist = Levenshtein.distance(fileName, askingFor)
    return (dist, fileName)
Example #35
def do_logic(file_content):
    for input_line in file_content:
        string1, string2 = input_line.split(',')
        diff = Levenshtein.distance(string1, string2)
        if OUTPUT:
            print('%s %s %d' % (string1, string2, diff))
Example #36
def get_org_infor(org1, org2):
    p_org1 = preprocessorg(org1)
    p_org2 = preprocessorg(org2)
    return (len(p_org1), len(p_org2), lv.distance(p_org1, p_org2))
Example #37
def levenshtein(a, b):
    return -Levenshtein.distance(a, b)
Example #38
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it',
    'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but',
    'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will',
    'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out',
    'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can',
    'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people', 'into',
    'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than',
    'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
    'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well',
    'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day',
    'most', 'us'
])

print "calculating distances..."

(dim, ) = words.shape

f = lambda (x, y): -leven.distance(x, y)

res = np.fromiter(itertools.imap(f, itertools.product(words, words)),
                  dtype=np.int16)  # signed dtype, since f returns negated distances
A = np.reshape(res, (dim, dim))

af = AffinityPropagation().fit(A)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

unique_labels = set(labels)
for i in unique_labels:
    print words[labels == i]
Example #39
forms_set = set()

with open('./polimorfologik-2.1/polimorfologik-2.1.txt', 'r',
          encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=';')
    for row in reader:
        forms_set.add(row[1].lower())

not_existing_words = []

for word in data_counter.most_common():
    if word[0] not in forms_set:
        not_existing_words.append(word)

pprint(not_existing_words[:30])

triple_missing = [x for x in not_existing_words if x[1] == 3]

pprint(triple_missing[:30])

for word in triple_missing[:30]:
    corect = []
    for form in forms_set:
        lev_dist = Levenshtein.distance(word[0], form)
        if lev_dist < 4:
            corect.append((form, lev_dist))

    corect.sort(key=lambda x: x[1])
    print(word)
    pprint(corect[:3])
Example #40
def fit(model, epochs, train_data_loader, valid_data_loader):
    best_leven = 1000
    optimizer = optim.AdamW(model.parameters(), 5e-4)
    len_train = len(train_data_loader)
    loss_func = nn.CTCLoss(blank=len(classes)).to(dev)
    for i in range(1, epochs + 1):
        # ============================================ TRAINING ========================================================
        batch_n = 1
        train_levenshtein = 0
        len_levenshtein = 0
        for spectrograms, labels, input_lengths, label_lengths in tqdm(
                train_data_loader,
                position=0,
                leave=True,
                file=sys.stdout,
                bar_format="{l_bar}%s{bar}%s{r_bar}" %
            (Fore.GREEN, Fore.RESET)):
            model.train()
            spectrograms, labels = spectrograms.to(dev), labels.to(dev)
            optimizer.zero_grad()
            loss_func(
                model(spectrograms).log_softmax(2).permute(1, 0, 2), labels,
                input_lengths, label_lengths).backward()
            optimizer.step()
            # ================================== TRAINING LEVENSHTEIN DISTANCE =========================================
            if batch_n > (len_train - 5):
                model.eval()
                with torch.no_grad():
                    decoded = model.beam_search_with_lm(spectrograms)
                    for j in range(0, len(decoded)):
                        actual = num_to_str(labels.cpu().numpy()[j]
                                            [:label_lengths[j]].tolist())
                        train_levenshtein += leven.distance(decoded[j], actual)
                        len_levenshtein += label_lengths[j]

            batch_n += 1
        # ============================================ VALIDATION ======================================================
        model.eval()
        with torch.no_grad():
            val_levenshtein = 0
            target_lengths = 0
            for spectrograms, labels, input_lengths, label_lengths in tqdm(
                    valid_data_loader,
                    position=0,
                    leave=True,
                    file=sys.stdout,
                    bar_format="{l_bar}%s{bar}%s{r_bar}" %
                (Fore.BLUE, Fore.RESET)):
                spectrograms, labels = spectrograms.to(dev), labels.to(dev)
                decoded = model.beam_search_with_lm(spectrograms)
                for j in range(0, len(decoded)):
                    actual = num_to_str(
                        labels.cpu().numpy()[j][:label_lengths[j]].tolist())
                    val_levenshtein += leven.distance(decoded[j], actual)
                    target_lengths += label_lengths[j]

        print('Epoch {}: Training Levenshtein {} | Validation Levenshtein {}'.
              format(i, train_levenshtein / len_levenshtein,
                     val_levenshtein / target_lengths),
              end='\n')
        # ============================================ SAVE MODEL ======================================================
        if (val_levenshtein / target_lengths) < best_leven:
            torch.save(model.state_dict(),
                       f=str((val_levenshtein / target_lengths) * 100).replace(
                           '.', '_') + '_' + 'model.pth')
            best_leven = val_levenshtein / target_lengths
import pandas as pd
import numpy as np
import Levenshtein
import random

random.seed(12345)

d = pd.read_csv("../data/asjp19wide.csv", index_col=0)


words = d.values[~d.isnull()]
words = np.concatenate([w.split('-') for w in words])

tests = pd.DataFrame(columns=['word1', 'word2', 'LD'])


for i in range(1000):
    if i % 100 == 0:
        print(i)
    w1, w2 = random.sample(list(words), 2)
    tests.loc[i] = [w1, w2, Levenshtein.distance(w1, w2)]


tests.to_csv('levenshteinTests.csv', index=False)
def eval_ctc(model, dataloader, idx2char):
    model.eval()

    t = time.time()
    total_dist = 0
    total_line_err = 0
    total_ratio = 0
    total_pred_char = 1
    total_label_char = 0
    total_samples = 0
    total_ned = 0

    for j, batch in enumerate(dataloader):
        imgs = batch[0].cuda()
        labels_length = batch[1].cuda()
        labels_str = batch[2]

        with torch.no_grad():
            outputs_ctc, _ = model(imgs)
            # outputs_att = decoder(sqs, label_att)

        prob = outputs_ctc.softmax(dim=2).cpu().numpy()
        pred = prob.argmax(axis=2)

        for k in range(pred.shape[1]):
            pred_str = ""
            prev = " "
            for t in pred[:,k]:
                if idx2char[t] != prev:
                    pred_str += idx2char[t]
                    prev = idx2char[t]
            
            pred_str = pred_str.strip()
            pred_str = pred_str.replace('-', '')

            dist = Levenshtein.distance(pred_str, labels_str[k])
            total_dist += dist
            ratio = Levenshtein.ratio(pred_str, labels_str[k])
            total_ratio += ratio
            total_ned += float(dist) / max(len(pred_str), len(labels_str[k]))

            total_pred_char += len(pred_str)
            total_label_char += len(labels_str[k])
            total_samples += 1

            if dist != 0: 
                total_line_err += 1
                print('pred: ', pred_str)
                print('label:', labels_str[k])
                    
    precision = 1.0 - float(total_dist) / total_pred_char
    recall = 1.0 - float(total_dist) / total_label_char
    ave_Levenshtein_ratio = float(total_ratio) / total_samples
    line_acc = 1.0 - float(total_line_err) / total_samples
    rec_score = 1.0 - total_ned / total_samples       
    print("precision: %f" % precision)
    print("recall: %f" % recall)
    print("ave_Levenshtein_ratio: %f" % ave_Levenshtein_ratio)
    print("line_acc: %f" % line_acc)
    print("rec_score: %f" % rec_score)
    return line_acc, rec_score
Example #43
import sys
from itertools import combinations

import Levenshtein
import synonyms

fns = sys.argv[1:]

errors = 0
warns = 0
for fn in fns:
    with open(fn, mode="r") as f:
        vegetables = f.read().strip().split('\n')

    for _seq1, _seq2 in combinations(enumerate(vegetables, 1), 2):
        idx1, seq1 = _seq1
        idx2, seq2 = _seq2
        min_len = min(len(seq1), len(seq2))

        dist = Levenshtein.distance(seq1, seq2)
        if dist < 2 and min_len > 2:
            errors += 1
            sys.stderr.write("\n".join([
                f"{fn}:{idx1}:{idx2}: ERROR: Duplicate sentences", seq1, seq2,
                f"(distance={dist})", ""
            ]))
        elif dist < 6:
            sim = synonyms.compare(seq1, seq2)
            if sim > 0.9:
                warns += 1
                sys.stderr.write("\n".join([
                    f"{fn}:{idx1}:{idx2}: WARNING: Possible duplicate sentences",
                    seq1, seq2, f"(distance={dist}, similarity={sim})", ""
                ]))
    def attack(self, epsilon, alpha, attack_type="FGSM", PGD_round=40):
        print("Start attack")

        data, target = self.sound.to(self.device), self.target.to(self.device)
        data_raw = data.clone().detach()

        # initial prediction
        spec = torch_spectrogram(data, self.torch_stft)
        input_sizes = torch.IntTensor([spec.size(3)]).int()
        out, output_sizes = self.model(spec, input_sizes)
        decoded_output, decoded_offsets = self.decoder.decode(
            out, output_sizes)
        original_output = decoded_output[0][0]
        print(f"Original prediction: {decoded_output[0][0]}")

        # ATTACK
        ############ ATTACK GENERATION ##############
        if attack_type == "FGSM":
            data.requires_grad = True

            spec = torch_spectrogram(data, self.torch_stft)
            input_sizes = torch.IntTensor([spec.size(3)]).int()
            out, output_sizes = self.model(spec, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            out = out.log_softmax(2)
            loss = self.criterion(out, self.target, output_sizes,
                                  self.target_lengths)

            self.model.zero_grad()
            loss.backward()
            data_grad = data.grad.data

            perturbed_data = self.fgsm_attack(data, epsilon, data_grad)

        elif attack_type == "PGD":
            for i in range(PGD_round):
                print(f"PGD processing ...  {i+1} / {PGD_round}", end="\r")
                data.requires_grad = True

                spec = torch_spectrogram(data, self.torch_stft)
                input_sizes = torch.IntTensor([spec.size(3)]).int()
                out, output_sizes = self.model(spec, input_sizes)
                out = out.transpose(0, 1)  # TxNxH
                out = out.log_softmax(2)
                loss = self.criterion(out, self.target, output_sizes,
                                      self.target_lengths)

                self.model.zero_grad()
                loss.backward()
                data_grad = data.grad.data

                data = self.pgd_attack(data, data_raw, epsilon, alpha,
                                       data_grad).detach_()
            perturbed_data = data
        ############ ATTACK GENERATION ##############

        # prediction of adversarial sound
        spec = torch_spectrogram(perturbed_data, self.torch_stft)
        input_sizes = torch.IntTensor([spec.size(3)]).int()
        out, output_sizes = self.model(spec, input_sizes)
        decoded_output, decoded_offsets = self.decoder.decode(
            out, output_sizes)
        final_output = decoded_output[0][0]

        perturbed_data = perturbed_data.detach()
        abs_ori = 20 * np.log10(
            np.sqrt(np.mean(np.absolute(data_raw.cpu().numpy())**2)))
        abs_after = 20 * np.log10(
            np.sqrt(np.mean(np.absolute(perturbed_data.cpu().numpy())**2)))
        db_difference = abs_after - abs_ori
        l_distance = Levenshtein.distance(self.target_string, final_output)
        print(f"Max Decibel Difference: {db_difference:.4f}")
        print(f"Adversarial prediction: {decoded_output[0][0]}")
        print(f"Levenshtein Distance {l_distance}")
        if self.save:
            torchaudio.save(self.save,
                            src=perturbed_data.cpu(),
                            sample_rate=self.sample_rate)
        self.perturbed_data = perturbed_data
        return db_difference, l_distance, self.target_string, final_output
Example #45
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 15 20:38:46 2019

@author: anchor
"""

# text edit distance
import Levenshtein
a = Levenshtein.distance('abhg', 'bgj')
print(a)

from sklearn.cluster import KMeans

X = [[0.262, 4], [0.192, 4], [4.052, 1.98], [1, 19.59], [2, 3.5], [0.78, 10.6],
     [2, 10.5], [2.038, 7.38], [0.574, 11.6], [5, 1.06], [4.43, 4.78],
     [0.514, 43], [0.592, 31], [1, 16.2], [1, 7.39], [1, 95.9], [1, 23.29],
     [1, 12.8], [1.338, 2.6], [0.46, 19]]

# K-means clustering
clf = KMeans(n_clusters=4)
y_pred = clf.fit_predict(X)
print(clf)
print(y_pred)

import matplotlib.pyplot as plt

x = [n[0] for n in X]
y = [n[1] for n in X]
Example #46
early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=5,
    validation_data=valid_generator.forfit(),
    validation_steps=len(valid_generator),
    callbacks=[early_stopping],
    verbose=2,
)
pred_pl = model.predict_generator(test_generator.forfit(),
                                  steps=len(test_generator))
pred_pl = pred_pl[:, 1]

pred = res * 0.8 + pred_pl * 0.2
pred = (pred >= alpha).astype('int')

for i in range(len(test_data)):
    d = test_data[i]
    texta = d[0]
    textb = d[1]
    if Levenshtein.distance(texta, textb) == 0:
        pred[i] = 1

test_df['label'] = pred
sub = test_df[['id', 'label']]
sub.to_csv(("../prediction_result/result_" +
            datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".csv"),
           header=None,
           index=False)
Example #47
def Edit_distance_str(str1, str2):
    import Levenshtein
    edit_distance_distance = Levenshtein.distance(str1, str2)
    similarity = 1-(edit_distance_distance/max(len(str1), len(str2)))
    return {'Distance': edit_distance_distance, 'Similarity': similarity}
[4] generate strategy
[1-9] restudy test
generate test
[4] intervention test
[4] assessment test

[x] for each item, calculate Levenshtein distance from the correct answer
[x] for each item, create a boolean variable for different Levenshtein cutoff thresholds (1,2,3,4)
[x] for each user, calculate scores (0-10) using different Levenshtein cutoff thresholds
[x] in R, plot each criterion against testScore, look for a step fn between 9 and 10 that is inclusive
"""
### Calculate lev distances

### Calculate interventionStrategyLevDistance
df_items['interventionStrategyLevDistance'] = df_items.apply(
    lambda row: lev.distance(row['itemEnglish'], row[
        'interventionStrategyUserInputRound1']),
    axis=1)

for n in range(1, 3):
    colname = "interventionStrategyLevDist" + str(n)
    df_items[colname] = df_items.apply(
        lambda row: 1 if row['interventionStrategyLevDistance'] <= n else 0,
        axis=1)

### Calculate interventionTestLevDistance
df_items['interventionTestLevDistance'] = df_items.apply(
    lambda row: lev.distance(row['itemEnglish'], row[
        'interventionTestUserInput']),
    axis=1)

for n in range(1, 3):
    colname = "interventionTestLevDist" + str(n)
    df_items[colname] = df_items.apply(
        lambda row: 1 if row['interventionTestLevDistance'] <= n else 0,
        axis=1)
Example #49
def similarity(s,t):
    l_max = max(len(s),len(t))
    return round(1- float(Levenshtein.distance(s, t)/float(l_max)),2)
Example #50
def train_end2end(model, vocab, datasets, use_feat):
    print("END2END model training...")
    print("Features:", STF_MODEL)
    print("Save Model path:", STF_MODEL_PATH)
    print("WER path:", END2END_WER_PATH)

    optimizer = Adam(model.parameters(), lr=END2END_LR)
    loss_fn = nn.CTCLoss(zero_infinity=True)

    lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.2, patience=4)

    best_wer = get_best_wer()
    curve = {"train": [], "val": []}

    current_best_wer = float("inf")
    trained = False
    # n_epochs since wer was updated
    since_wer_update = 0
    try:
        for epoch in range(1, END2END_N_EPOCHS + 1):
            print("Epoch", epoch)
            for phase in ["train", "val"]:
                if phase == "train":
                    model.train()  # Set model to training mode
                else:
                    model.eval()

                dataset = datasets[phase]
                n_batches = dataset.start_epoch()
                losses = []
                hypes = []
                gts = []

                with torch.set_grad_enabled(phase == "train"):
                    pp = ProgressPrinter(n_batches, 25 if USE_ST_FEAT else 1)
                    for i in range(n_batches):
                        optimizer.zero_grad()
                        X_batch, Y_batch, Y_lens = dataset.get_batch(i)
                        X_batch = X_batch.to(DEVICE)
                        Y_batch = Y_batch.to(DEVICE)

                        preds = model(X_batch).log_softmax(dim=2)
                        T, N, V = preds.shape
                        X_lens = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
                        loss = loss_fn(preds, Y_batch, X_lens, Y_lens)
                        losses.append(loss.item())

                        if phase == "train":
                            loss.backward()
                            optimizer.step()

                        out_sentences = predict_glosses(preds, decoder=None)
                        gts += [y for y in Y_batch.view(-1).tolist() if y != 0]

                        for sentence in out_sentences:
                            hypes += sentence

                        if i == 0 and SHOW_EXAMPLE:
                            pred = " ".join(vocab.decode(out_sentences[0]))
                            gt = Y_batch[0][:Y_lens[0]].tolist()
                            gt = " ".join(vocab.decode(gt))
                            print("   ", phase, 'Ex. [' + pred + ']', '[' + gt + ']')

                        if SHOW_PROGRESS:
                            pp.show(i, "    ")

                    if SHOW_PROGRESS:
                        pp.end("    ")

                hypes = "".join([chr(x) for x in hypes])
                gts = "".join([chr(x) for x in gts])
                phase_wer = Lev.distance(hypes, gts) / len(gts) * 100

                if phase == "train":
                    lr_scheduler.step(phase_wer)

                curve[phase].append(phase_wer)
                phase_loss = np.mean(losses)
                print("   ", phase.upper(), "WER:", phase_wer, "Loss:", phase_loss)

                if phase_wer < best_wer[phase]:
                    best_wer[phase] = phase_wer
                    save_end2end_model(model, phase, best_wer[phase])

                if phase == "val":
                    if phase_wer < current_best_wer:
                        current_best_wer = phase_wer
                        since_wer_update = 0
                    else:
                        since_wer_update += 1

                    if since_wer_update >= END2END_STOP_LIMIT and not use_feat:
                        trained = True
                        raise KeyboardInterrupt

    except KeyboardInterrupt:
        pass

    if epoch >= END2END_N_EPOCHS:
        trained = True

    with open(os.path.join(VARS_DIR, "curve.pkl"), 'wb') as f:
        pickle.dump(curve, f)

    return best_wer, trained
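The WER computation above relies on a small trick: each integer gloss id is encoded as a single character, so Levenshtein.distance counts token-level edits rather than character-level ones. A minimal standalone sketch of the same idea (the function name and example ids are illustrative, not from the original):

import Levenshtein as Lev

def wer_from_ids(hyp_ids, gt_ids):
    # Map each token id to one character; the edit distance then counts
    # insertions/deletions/substitutions of whole tokens.
    hyp = "".join(chr(i) for i in hyp_ids)
    gt = "".join(chr(i) for i in gt_ids)
    return Lev.distance(hyp, gt) / max(len(gt), 1) * 100

print(wer_from_ids([5, 7, 9], [5, 8, 9, 11]))  # one substitution + one insertion over 4 tokens -> 50.0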
Example #51
0
 def checkKNF_list(self, knf_list):
     dist = [lv.distance(self.cleaned_company_name, x) for x in knf_list]
     l1, l2 = zip(*sorted(zip(dist, knf_list), reverse=False))
     return l1, l2
Example #52
0
import Levenshtein
import os
import requests
import requests_cache

requests_cache.install_cache('../cache')
url = 'https://adventofcode.com/' + os.path.abspath(__file__).split(
    '/')[-2] + '/day/' + __file__.split('.')[0] + '/input'
s = requests.get(url, cookies={"session": os.environ['SESSION']}).text.strip()

two = 0
three = 0

for line in s.splitlines():
    twos = False
    threes = False
    for char in range(ord('a'), ord('z') + 1):
        if line.count(chr(char)) == 2:
            twos = True
        if line.count(chr(char)) == 3:
            threes = True
    two += int(twos)
    three += int(threes)
    # Part 2: print both IDs of the pair that differ by exactly one character
    for otherLine in s.splitlines():
        if Levenshtein.distance(line, otherLine) == 1:
            print(line)

print(two * three)
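For the usual second half of this puzzle, the interesting output is the characters the two near-identical IDs share rather than the IDs themselves. A hedged follow-up sketch (the helper name is illustrative; it assumes equal-length lines):

def common_letters(a, b):
    # Keep only the positions where the two IDs agree.
    return "".join(x for x, y in zip(a, b) if x == y)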
Example #53
0
import re
import Levenshtein

def leven_comparison(actual, result):
    # Strip all non-word characters before comparing.
    actual_formatted = re.sub(r"[^\w]", "", actual)
    result_formatted = re.sub(r"[^\w]", "", result)
    if actual_formatted == result_formatted:
        return 0  # strings are the same
    return Levenshtein.distance(actual_formatted, result_formatted)
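Usage sketch: punctuation and whitespace are stripped before comparing, but case is not normalized:

print(leven_comparison("colour!", "colour"))  # 0 - identical once punctuation is removed
print(leven_comparison("color.", "colour"))   # 1
print(leven_comparison("Color", "color"))     # 1 - the comparison is case-sensitive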
Example #54
0
     expression=lambda word1, word2, analyzer: abs(len(word1) - len(word2)),
     lang={"russian", "english"},
     order=5),
 "length difference norm by max":
 Feature(description="""
      Absolute difference between the word lengths, divided by the maximum of the two lengths
     """,
         expression=lambda word1, word2, analyzer: abs(
             len(word1) - len(word2)) / max(len(word1), len(word2)),
         lang={"russian", "english"},
         order=6),
 "Levenshtein difference":
 Feature(description="""
      Edit distance between the words
     """,
         expression=lambda word1, word2, analyzer: Levenshtein.distance(
             word1, word2),
         lang={"russian", "english"},
         order=7),
 "Levenshtein difference (lemmas)":
 Feature(description="""
      Edit distance between the lemma (dictionary) forms of the words
     """,
         expression=lambda word1, word2, analyzer: Levenshtein.distance(
             analyzer.parse(word1)[0].normal_form,
             analyzer.parse(word2)[0].normal_form),
         lang={"russian"},
         order=8),
 "Levenshtein difference (lemmas) norm by length sum":
 Feature(description="""
      Edit distance between the dictionary forms of the words
     """,
Example #55
0
logging.getLogger().setLevel(logging.INFO)

api_key = os.getenv("ICUBAM_API_KEY")
icubam_host = os.getenv("ICUBAM_HOST")

france_departments = set(
  list(icubam.predicu.data.load_france_departments().departmentName.unique())
)
bedcounts = icubam.predicu.data.load_icubam(
  api_key=api_key, icubam_host=icubam_host, clean=False
)
icubam_departments = set(list(bedcounts.icu_dept.unique()))

suspects = icubam_departments - france_departments
candidates = france_departments - icubam_departments

fixes = dict()
for suspect in suspects:
  best_candidate = None
  best_candidate_dist = int(1e6)
  for candidate in candidates:
    dist = levenshtein.distance(suspect, candidate)
    if dist < best_candidate_dist:
      best_candidate = candidate
      best_candidate_dist = dist
  fixes[suspect] = best_candidate
  print(suspect, '->', best_candidate, '( dist =', best_candidate_dist, ')')

with open('icubam/predicu/data/icubam_department_typo_fixes.json', 'w') as f:
  json.dump(fixes, f)
Example #56
0
def main():

    rows = []
    rows.append(["新編日本古典文学全集・テキスト", "KuroNet翻刻", "KuroNet翻刻(前行を含む)", "巻", "ID"])

    line = "磯づたひせず とてはしたなかめりとや"
    # line = "とてはしたなかめりとや"
    # line = "く中〱なりとおほすとさま"
    # line = "いそづたひせずとてはしたなかめりとや"
    # line = "よるべなみ風のさわがす舟人も思はぬかたに磯づた"

    text = convert_line(line, "")

    print(text)

    map = {}
    map[line] = []

    # print(configs)

    count = 0

    for vol in configs:

        config = configs[vol]

        print(config)

        # koui = config["data"]

        VOL = str(vol).zfill(2)

        
        if VOL != "35" and False:
            continue
        

        print(VOL)

        path = '../../docs/iiif/kuronet/'+VOL+'.json'

        if not os.path.exists(path):
            continue

        with open(path) as f:
            df = json.load(f)
            members = df["selections"][0]["members"]



        ################## Matching

        

        indexedObj = {}

        prev_line = ""
        

        for i in range(len(members)):

            member = members[i]

            prev_label = ""

            # the previous line (i - 1)
            if i - 1 >= 0:
                prev_label = members[i-1]["label"]
            
            label, text2 = convert_ocr(members[i]["label"], prev_label)

            print(VOL, i, text, text2)
            
            score = Levenshtein.distance(text, text2)
            score = score / max(len(text), len(text2))  # normalize by the longer length

            obj = {
                "label" : label,
                "main" : member["label"],
                "score" : score,
                "member_id" : member["@id"],
                "index" : i,
                "vs" : "【新編】" + text + "---【OCR】" + text2,
                "vol" : VOL
            }

            map[line].append (obj)

            indexedObj[count] = obj

            count += 1


    ################## Aggregation

    size = len(map)

    count = 0

    # for each line of the collation (koui) text
    for line in map:

        count += 1

        print(count, size)

        # print(str(koui[line])+"\t"+line)

        obj = map[line]

        # sort in ascending order of score
        score_sorted = sorted(obj, key=lambda x:x["score"])

        flg = True

        for i in range(len(score_sorted)):

            data = score_sorted[i]

            if i < 25:
                print(data)

                row = [line, data["main"], data["label"], data["vol"], data["member_id"]]
                rows.append(row)

    df = pd.DataFrame(rows)

    df.to_excel('data/check.xlsx',index=False, header=False)
Example #57
0
barcodes = set(barcodes)

with pysam.FastxFile(args.read1) as fh, pysam.FastxFile(args.read2) as fh2:
    
    n = 0
    y = 0
    for record_fh, record_fh2  in zip(fh, fh2):
        barcode = record_fh.sequence[0:24]

        y += 1
        barcode_list = {}
        for b in barcodes:
            

            if Levenshtein.distance(barcode, b) <= int(args.distance):
                n +=1
                
                b = b + record_fh.sequence[24:]
                barcode_list[b] = Levenshtein.distance(barcode, b)
    
            else:
                pass

        if bool(barcode_list):
            b = min(barcode_list, key=barcode_list.get)

            res = True
            test_value = list(barcode_list.values())[0]

            for ele in barcode_list:
Example #58
0
        # print "-->", median,
        if median == last_median:
            break
    return median 

while True:

    print 
    target      = raw_input('Enter a target (ground-truth) word : ').strip()
    predictions = raw_input('Enter some predicted  words        : ').strip().split()
    print

    # get distance for each predicted word, as well as their average
    dist_tot = 0     
    for word in predictions: 
        dist = lev.distance(target, word)
        print "%4i = edit_distance(%s, %s)" %  (dist, target, word)
        dist_tot += dist
    dist_avg_pred = 1.*dist_tot/len(predictions)
    print
    print "Average edit_distance between each predicted word and the target word",
    print [target], "is:", dist_avg_pred

    # get the distance for the median word  (to compare)
    weights= len(predictions) * [1,]
    median = get_median(predictions, weights)
    print "\nMedian string of", predictions, "is", [median]
    dist_median    = 1.*lev.distance(target, median)
    print "Edit_distance from the median string", [median],
    print " to the target word ", [target], "is:", dist_median
    print "\n-----"
Example #59
0
def silhouette(Dicofasta, Dicocentroid, Dicoresult, DicoNeighbour,
               len_max_CDR3, len_max_J, dico_vjunc):
    summe = 0
    #for cluster in tqdm.tqdm(Dicoresult.keys()) :
    for cluster in Dicoresult.keys():
        #print ("cluster",cluster)
        for seq in Dicoresult[cluster]:
            #print(Dicofasta[seq])
            dist_intra = 0
            dist_neighb = {}
            for seq_same_clust in Dicoresult[cluster]:
                if seq_same_clust != seq:
                    V_component = 0
                    # same V, without considering allel
                    if Dicofasta[seq][0].split("*")[0] != Dicofasta[
                            seq_same_clust][0].split("*")[0]:
                        V_component += 1
                    # normalize the distance depending on the longest seq in the whole repertoire.
                    CDR_component = float(
                        Levenshtein.distance(
                            Dicofasta[seq][2],
                            Dicofasta[seq_same_clust][2])) / len_max_CDR3
                    J_component = (100 - get_similarity_score(
                        Dicofasta[seq][3], Dicofasta[seq_same_clust][3],
                        len(Dicofasta[seq][3]),
                        len(Dicofasta[seq_same_clust][3]))) / float(len_max_J)
                    #print ('J_component',J_component)
                    dist_intra += (CDR_component + V_component + J_component) / 3.0  # mean of the three components
            if len(Dicoresult[cluster]) != 1:
                ai = float(dist_intra) / (len(Dicoresult[cluster]) - 1)
            else:
                ai = 0
            for seq_neighb in Dicoresult[DicoNeighbour[cluster]]:
                V_component_b = 0
                CDR_component_b = 0
                J_component_b = 0
                # same V, without considering allel
                if Dicofasta[seq][0].split(
                        "*")[0] != Dicofasta[seq_neighb][0].split("*")[0]:
                    V_component_b += 1
                CDR_component_b = Levenshtein.distance(
                    Dicofasta[seq][1],
                    Dicofasta[seq_neighb][1]) / float(len_max_CDR3)
                J_component_b = (100 - get_similarity_score(
                    Dicofasta[seq][3], Dicofasta[seq_neighb][3],
                    len(Dicofasta[seq][3]), len(
                        Dicofasta[seq_neighb][3]))) / float(len_max_J)
                #print('V_component_b',V_component_b,'CDR_component_b',CDR_component_b,'J_component_b',J_component_b)
                dist_neighb[
                    seq_neighb] = (CDR_component_b + V_component_b + J_component_b) / 3.0
            #print ('dist_neigh',dist_neighb)
            bi = min(dist_neighb.values())
            #print ("================")
            #print (seq)
            #print ("ai = ",ai )
            if bi < ai:
                #print ("lalalalaala",ai,bi)
                #print("disonnnnn,Dicoresult",)

                to_move = (list(dist_neighb.keys())[list(
                    dist_neighb.values()).index(bi)])
                #print("tomove",to_move)
                Dicoresult[DicoNeighbour[cluster]].remove(to_move)
                if len(Dicoresult[DicoNeighbour[cluster]]) == 0:
                    del Dicoresult[DicoNeighbour[cluster]]
                Dicoresult[cluster].append(to_move)
                #print(Dicoresult)
                Dicocentroid = CalculateMedoid(dico_vjunc, Dicoresult)
                DicoNeighbour = Creat_dico_neighbour(Dicocentroid)
                #silhouette(Dicofasta,Dicocentroid,Dicoresult,DicoNeighbour, len_max_CDR3,len_max_J,dico_vjunc)

            return (Dicoresult, DicoNeighbour, Dicocentroid)
Example #60
0
outf2 = iotools.open_file(
    "corrected_reads.dir/" + args.outname + "_corrected.fastq.2.gz", "w")
log = iotools.open_file("corrected_reads.dir/" + args.outname + ".log", "w")

with pysam.FastxFile(args.read1) as fh, pysam.FastxFile(args.read2) as fh2:

    n = 0
    y = 0
    for record_fh, record_fh2 in zip(fh, fh2):
        barcode = record_fh.sequence[0:24]

        y += 1

        for b in barcodes:

            if Levenshtein.distance(barcode, b) <= int(args.distance):
                n += 1
                b = b + record_fh.sequence[24:]

                outf.write("@%s\n%s\n+\n%s\n" %
                           (record_fh.name, b[::2], record_fh.quality[::2]))
                outf2.write(
                    "@%s\n%s\n+\n%s\n" %
                    (record_fh2.name, record_fh2.sequence, record_fh2.quality))
                break

            else:
                pass

log.write("The number of total reads is: %s\n" % (y))
log.write("The number of total recovered reads is: %s\n" % (n))