コード例 #1
0
def song_comparison(s, t):
    """Score how similar two song titles are, word-wise.

    Falls back to plain string Levenshtein when s has more words than t;
    uses a min-weight matching for more than two words, and brute-force
    permutations otherwise.
    """
    s, t = s.split(), t.split()

    # Baseline: character-level distance over the full strings.
    similarity = levenshtein(' '.join(s), ' '.join(t))

    n, m = len(s), len(t)

    if n > m:
        return similarity

    if n > 2:
        # Pairwise word distances feed an assignment problem.
        cost = [[levenshtein(u, v) for v in t] for u in s]
        return min_weight_max_matching(n, m, cost)

    # Few words: try every alignment of s onto an n-subset of t.
    pair_cost = {(u, v): levenshtein(u, v) for u in s for v in t}
    for candidate in permutations(t, n):
        total = sum(pair_cost[(u, v)] for (u, v) in zip(s, candidate))
        similarity = min(similarity, total)

    return similarity
コード例 #2
0
ファイル: test_levenshtein.py プロジェクト: Answeror/lit
def check_first_insertion(query, good, bad):
    """Assert that *good* is strictly closer to *query* than *bad* under
    the first-insertion cost model."""
    costs = dict(
        deletion_cost=100,
        insertion_cost=1,
        first_insertion_cost=50,
        prepend_first_insertion_cost=5,
        append_first_insertion_cost=10,
        substitution_cost=100,
    )
    good_distance = levenshtein(query, good, memo=[], precol=[], **costs)
    bad_distance = levenshtein(query, bad, memo=[], precol=[], **costs)
    assert_less(good_distance, bad_distance)
コード例 #3
0
ファイル: utils.py プロジェクト: kazarin1alex/lit
 def update(self, old, new):
     """Refresh the cached Levenshtein state after the query changes from *old* to *new*.

     Keeps only the memo rows covering the common prefix of the two
     queries, reseeds the first column from the parent node, and re-runs
     levenshtein to rebuild the remaining state in place.
     """
     # Keep one memo row per shared-prefix character (plus the base row).
     self.memo = self.memo[:_common_prefix_length(old, new) + 1]
     # Seed the first column from the parent's last memo column, if any.
     self.precol = [] if self.parent is None else [x[-1] for x in self.parent.memo]
     # Called for its side effect on memo/precol; the returned distance is discarded.
     levenshtein(
         new,
         self.text,
         deletion_cost=self.tree.deletion_cost,
         insertion_cost=self.tree.insertion_cost,
         substitution_cost=self.tree.substitution_cost,
         transposition_cost=self.tree.transposition_cost,
         memo=self.memo,
         precol=self.precol
     )
コード例 #4
0
def compareDataset(in1, in2, path):
    """Compare the dataset stored at *path* in two HDF5-like containers.

    Returns None when the values match (or the dataset is ignored),
    'Missing' when *in2* lacks the path, and otherwise a difference
    measure: a Levenshtein distance for strings, an absolute numeric
    difference (thresholded) for floats and integers.
    """
    d1 = in1.get(path)
    d2 = in2.get(path)
    if toIgnore(d1):
        return None
    if d2 is None:
        return 'Missing'
    diff = testSpecial(d1, d2, path)
    # Identity check instead of the original `diff != None`.
    if diff is not None:
        return diff
    # NOTE(review): np.string_ is a deprecated alias of np.bytes_ (removed in
    # NumPy 2.0) -- confirm the pinned NumPy version before upgrading.
    if d1.dtype.type is np.string_:
        if d1[0] != d2[0]:
            return levenshtein(d1[0], d2[0])
        return None
    if d1.dtype.kind == 'f':
        # Floats only count as different beyond a 0.1 tolerance.
        diff = abs(d1[0] - d2[0])
        return diff if diff > .1 else None
    # Integer datasets: any nonzero difference is reported.
    diff = abs(d1[0] - d2[0])
    return diff if diff > 0 else None
コード例 #5
0
ファイル: Top53.py プロジェクト: thiamshui/gigglesearch
def find5(word, C):
    """Return the 5 candidates in *C* closest to *word* by edit distance.

    Exact matches (distance 0) are skipped, the distance cap starts at
    len(word), and ties with the current fifth-best entry are dropped.

    Fixes: the original mixed tabs with an 8-space-indented `return`,
    which is a TabError under Python 3; the five near-identical elif
    branches are unified into a single insertion loop.
    """
    minimum = len(word)
    top5 = [""] * 5
    edit_dist5 = [minimum] * 5

    for c in C:
        edit_dist = levenshtein.levenshtein(word, c)
        # Skip exact matches and anything not strictly better than the worst kept.
        if edit_dist == 0 or edit_dist > edit_dist5[4]:
            continue
        # Insert at the first slot it strictly beats; drop the last entry.
        for pos in range(5):
            if edit_dist < edit_dist5[pos]:
                edit_dist5 = edit_dist5[:pos] + [edit_dist] + edit_dist5[pos:4]
                top5 = top5[:pos] + [c] + top5[pos:4]
                break
    return top5
コード例 #6
0
ファイル: phonetics.py プロジェクト: nwinter/bantling
def spellability(name, test=False):
    """
    Return a 0-1 score representing how spellable this name is. 0 is bad.
    
    How? To start off, we look at all the names with the same Metaphone key,
    then assign a penalty depending on collisions with any common names.
    The size of the penalty will depend on how common the other name is relative
    to this name (so this is also somewhat of a popularity weighting), and
    the Levenshtein distance to any names with Metaphone collisions, since
    Metaphone ignores vowels and we do care about actual vowel and letter
    differences here.

    NOTE: Python 2 module (print statements below).
    """
    score = 1
    for metaphone in name.metaphones:
        for other in _metaphone_index[metaphone]:
            if other is name: continue
            # `or 0.000001` guards against division by zero for never-popular names.
            pop_ratio = (other.get_popularity(emphasize_recent=True) /
                         (name.get_popularity(emphasize_recent=True) or 0.000001))
            if pop_ratio < 0.01: continue  # levenshtein is expensive
            distance = levenshtein(other.name, name.name)
            # Penalty shrinks quadratically with edit distance.
            penalty = math.log(1 + pop_ratio) / (distance ** 2)
            if test and name.name == "Eliza" and penalty > 0.1:
                print (name.name, "took a hit of", penalty, "from",
                       other.name, pop_ratio, distance)
            # Each collision can at worst halve the running score.
            score *= max(0.5, (1 - penalty))
            if test: print "%s vs. %s: pop_ratio %.5f"%(other, name, pop_ratio)
    if test: print "%s got spellability score %.3f"%(name, score)
    return score
コード例 #7
0
def findPhonetics(subject, bookshelf):
    """Collect words from the *bookshelf* word list whose metaphone contains
    the metaphone of *subject*, ranked by edit distance.

    Returns a list of (word, distance) tuples sorted by distance, where the
    distance is measured over the slice of the word aligned with the
    subject's phonetic position; returns None when *subject* cannot be
    encoded by metaphone.

    Fix: the word-list file handle was never closed -- now managed by `with`.
    """
    try:
        subPhonetic = metaphone(subject)
    except TypeError:
        print("Soundex broke")
        return

    similar = []

    with open(bookshelf) as dictionary:
        for line in dictionary:
            if "-" not in line:
                line = line.split("\n")[0]
                try:
                    linePhonetic = metaphone(line)
                    if line != subject and linePhonetic.find(subPhonetic) != -1:
                        subjectIndex = findPhoneticIndex(subject, line)
                        similar.append(
                            (line,
                             levenshtein(
                                 line[subjectIndex:subjectIndex + len(subject)],
                                 subject)))
                except TypeError:
                    # Skip lines metaphone cannot encode.
                    continue

    similar.sort(key=lambda tup: tup[1])
    return similar
コード例 #8
0
ファイル: sms.py プロジェクト: BoldBigflank/idovoices
def submitGuess(phoneNumber, guess, self):
    """Check *guess* against every other player's word and award points by SMS.

    A guess within GUESS_THRESHOLD edit distance of another player's word
    scores both the guesser and the guessed player, notifies both via
    Twilio SMS, and resets the game when either reaches MAX_SCORE.
    Returns the matched word, or None when nothing matched.

    NOTE(review): `self` as a third positional parameter (the request
    handler) is unconventional -- confirm the call sites before renaming.
    """
    currentPlayers = User.User.gql("")
    # for each current person in the db
    logging.info("Guess is " + guess)
    for player in currentPlayers:
        logging.info("player number " + str(player.number))
        # Players cannot guess their own word; skip players without a word.
        if player.number == phoneNumber or not player.word:
            continue
        distance = levenshtein(guess.lower(), player.word.lower())
        logging.info("distance: " + str(distance))
        if distance <= GUESS_THRESHOLD:
            r = twilio.Response()
            # Award points
            guesserScore = addPoints(phoneNumber, CORRECT_GUESS)

            guesserMessage = "You guessed " + player.word + " correct! Your score is now " + str(guesserScore) + "."
            if guesserScore >= MAX_SCORE:
                guesserMessage += " You win!"
            r.addSms(guesserMessage, to=phoneNumber)

            # The guessed player also scores and gets a new word unless they won.
            guessedScore = addPoints(player.number, NAME_GUESSED)
            guessedMessage = "You were guessed!  Score: " + str(guessedScore) + "."
            if guessedScore >= MAX_SCORE:
                guessedMessage += " You win!"
            else:
                guessedMessage += " Next: " + assignNextName(player.number)
            r.addSms(guessedMessage, to=player.number)
            self.response.out.write(r)
            if guesserScore >= MAX_SCORE or guessedScore >= MAX_SCORE:
                resetGame()
            return player.word
コード例 #9
0
ファイル: Top53.py プロジェクト: thiamshui/gigglesearch
def find5(word, C):
    """Track the five candidates in *C* closest to *word* by edit distance.

    Exact matches are ignored, distances start capped at len(word), and
    ties with the current fifth-best entry are dropped.
    """
    best_words = ["", "", "", "", ""]
    best_dists = [len(word)] * 5

    for candidate in C:
        dist = levenshtein.levenshtein(word, candidate)
        # Ignore exact matches and anything not strictly better than the
        # current worst kept entry (ties are dropped).
        if dist == 0 or dist > best_dists[4]:
            continue
        # Splice into the first slot the new distance strictly beats.
        for slot, kept in enumerate(best_dists):
            if dist < kept:
                best_dists = best_dists[:slot] + [dist] + best_dists[slot:4]
                best_words = best_words[:slot] + [candidate] + best_words[slot:4]
                break
    return best_words
コード例 #10
0
def get_distance(variant1, variant2, distanceMatrix):
    """Return the Levenshtein distance between two variants, memoised
    symmetrically in *distanceMatrix* (a dict of dicts).

    The distance is computed and stored in both directions the first time
    a pair is seen; later calls are pure lookups.  Collapses the original
    duplicated present/absent branches into one path via setdefault.
    """
    row = distanceMatrix.setdefault(variant1, dict())
    if variant2 not in row:
        distance = levenshtein(variant1, variant2)
        row[variant2] = distance
        # Mirror the value so lookups work in either order.
        distanceMatrix.setdefault(variant2, dict())[variant1] = distance
    return row[variant2]
コード例 #11
0
def condition(movie, titles):
    """Return True for a highly-rated, well-voted movie whose title is
    within edit distance 3 of any wanted title."""
    if float(movie.rank) < 8.0 or int(movie.votes) < 1000:
        return False
    return any(lnsh.levenshtein(movie.title, needed) <= 3 for needed in titles)
 def correct(self, incorrect):
     """Return the most probable correction for *incorrect*.

     The 10 dictionary words with the smallest Levenshtein distance are
     paired with the input, scored by the trained model (self.m) on
     features from self.fe, and the highest-scoring pair wins.  Falls
     back to 'gopdebate' when even the winner is more than 2 edits away.

     NOTE: Python 2 (print statements); the score used is the first
     column of predict_proba -- presumably P(correct pair), confirm
     against the model's class ordering.
     """
     test_pairs = [(incorrect, candidate) for candidate in heapq.nsmallest(
         10,
         self.dictionary,
         key=lambda x: levenshtein.levenshtein(incorrect, x))]
     gx_test = self.fe.transform(test_pairs)
     # Pr is a list of probability, corresponding to each correction pair in test_pairs
     pr = self.m.predict_proba(gx_test)
     print pr
     cr = zip(pr, test_pairs)
     print cr
     # We use the one with largest probability as the correction of the incorrect word
     cr = max(cr, key=lambda x: x[0][0])
     if levenshtein.levenshtein(incorrect, cr[1][1]) > 2:
         return 'gopdebate'
     else:
         return cr[1][1]
 def test_mutar(self):
     """A mutated solution must differ from the original, but by at most 50%."""
     sol = self.__tsp.generar()
     mutada = self.__tsp.mutar(sol)
     distancia = levenshtein(sol, mutada)
     # The mutated solution must not be identical to the original.
     self.assertGreater(distancia, 0)
     # The solutions must not differ by more than 50% of the length.
     self.assertLessEqual(distancia, ceil(len(sol) / 2))
コード例 #14
0
def condition(movie, titles):
    """True when a movie is highly rated (>= 8.0), has at least 1000
    votes, and its title nearly matches one of *titles*."""
    if float(movie.rank) >= 8.0 and int(movie.votes) >= 1000:
        for wanted in titles:
            # Edit distance of 3 or less counts as a match.
            if lnsh.levenshtein(movie.title, wanted) <= 3:
                return True
    return False
コード例 #15
0
 def test_algorith(self):
     """Check levenshtein against a handful of known word pairs."""
     name = "abba"
     cases = zip(["abba", "acba", "abbba", "ackbar"], [0, 1, 1, 3])
     for word, expected in cases:
         self.assertEqual(expected, levenshtein.levenshtein(name, word))
コード例 #16
0
 def __getDistanceVariants(self, variant1, variant2):
     """Return the Levenshtein distance between two variants, cached in
     self.__distanceMatrix (keyed variant1 -> variant2)."""
     row = self.__distanceMatrix.setdefault(variant1, dict())
     if variant2 not in row:
         # First request for this pair: compute and remember it.
         row[variant2] = levenshtein(variant1, variant2)
     return row[variant2]
コード例 #17
0
 def correct(self, incorrect):
     """Return the model's best correction for *incorrect*.

     The 100 dictionary words nearest by Levenshtein distance are paired
     with the input, featurised, and scored; the pair with the largest
     probability provides the correction.
     """
     candidates = heapq.nsmallest(
         100, self.dictionary,
         key=lambda entry: levenshtein.levenshtein(incorrect, entry))
     pairs = [(incorrect, candidate) for candidate in candidates]
     features = self.fe.transform(pairs)
     # One probability row per correction pair.
     probabilities = self.m.predict_proba(features)
     best = max(zip(probabilities, pairs), key=lambda scored: scored[0][0])
     return best[1][1]
コード例 #18
0
ファイル: core.py プロジェクト: pkozyulina/bioinf_scripts
def build_alignment(input_file, ref_file_norm, ref_file_mut, quality):
    """For each read, find the closest normal reference and compare it
    against the matching mutant reference.

    Maps gene name -> [distance to best normal ref, distance to that
    gene's mutant ref].  Prints the resulting dict.

    NOTE(review): if ref_norm_dict is empty, ref_min stays "" and
    ref_mut_dict[""] raises KeyError; also, each gene keeps only the
    values from the last read that selected it -- confirm both are intended.
    """
    # here we will have a dict with gene names and how many reads aligns to norm and mutant form
    gene_list = collections.defaultdict(list)
    ref_norm_dict = parse_reference(ref_file_norm)
    ref_mut_dict = parse_reference(ref_file_mut)
    for seq in parse_fastq(input_file, quality):
        min_dist = 1000
        ref_min = ""
        # seq_small is just an alias for the read sequence.
        seq_small = seq
        #print('SEQ %s' % (seq))
        # Pick the normal reference with the smallest edit distance.
        for ref_norm in ref_norm_dict:
            dist = levenshtein(ref_norm_dict[ref_norm], seq_small)
            #print(dist, ref_norm)
            if dist < min_dist:
                min_dist = dist
                ref_min = ref_norm
        # Distance to the mutant form of the best-matching gene.
        mut_dist = levenshtein(ref_mut_dict[ref_min], seq_small)
        gene_list[ref_min] = [min_dist, mut_dist]
    print(gene_list)
コード例 #19
0
def stringCheck(x):
    """Return False when *x* is a close variant of a whitelisted word.

    A word within Levenshtein distance 1 of a whitelist entry counts as a
    variant, but only for inputs of length >= 6 (shorter words would
    match too easily).

    Fixes: the whitelist file handle was never closed (now `with`), and
    the loop-invariant length check is hoisted out of the loop.
    """
    # Nothing shorter than 6 characters can ever match.
    if len(x) < 6:
        return True
    with open('whitelist.csv', 'r') as whitelist_raw:
        whitelist = [line.replace('\n', '') for line in whitelist_raw]
    for w in whitelist:
        # Allows for minor misspellings of an innocuous word.
        if levenshtein(w, x) <= 1:
            return False
    return True
コード例 #20
0
ファイル: correct.py プロジェクト: Zotek/PJN
def correct(dictionary, word):
    """Print the 5 dictionary lines closest to *word* by Levenshtein distance.

    *dictionary* is a path to a UTF-8 word list, one entry per line.
    Progress is reported every 10000 lines.

    NOTE(review): lines keep their trailing newline, which inflates every
    distance by one -- confirm that is intended.  Python 2 module (print
    statements below); the codecs handle is never closed.
    """
    lines = [line for line in codecs.open(dictionary, encoding="utf-8")]
    # Map each dictionary line to its edit distance from the query word.
    lines_map = {}
    i = 0
    for line in lines:
        lines_map[line] = levenshtein(line, word)
        i += 1
        if i % 10000 == 0: print(i * 100) / len(lines), "%"
    # Report the five best-scoring entries.
    for line, score in nsmallest(5, lines_map.items(), key=itemgetter(1)):
        print line, " : ", score
コード例 #21
0
def levenshtein_kmers(kmerfile, bh_pvalue_cutoff, outfile):
    """Write an all-vs-all Levenshtein-distance matrix for significant kmers.

    Reads a tab-separated kmer table (kmer in column 0, enrichment in
    column 3, BH p-value in column 5), keeps kmers whose BH p-value is at
    most *bh_pvalue_cutoff*, and writes the pairwise distance matrix to
    *outfile* with the kmer order prepended as a header line.

    Case encodes direction -- enriched (>1 or 'NA') kmers are upper-cased,
    depleted (<1) kmers lower-cased -- while distances are computed
    case-insensitively.

    Fixes: the input file handle was never closed (now `with`); the bare
    Python 2 print is parenthesised.
    """
    enrichments = {}
    levenshteins = {}
    kmers = []

    with open(kmerfile, 'r') as kmersfh:
        for line in kmersfh:
            line = line.strip().split('\t')
            kmer = line[0]
            enrichment = line[3]
            bh_pvalue = line[5]

            # Skip the header row; keep only significant kmers.
            # Define case by enrichment (greater than or less than 1).
            if kmer != 'kmer' and float(bh_pvalue) <= float(bh_pvalue_cutoff):
                if enrichment == 'NA':
                    kmers.append(kmer.upper())
                    enrichments[kmer] = enrichment
                elif float(enrichment) > 1:
                    kmers.append(kmer.upper())
                    enrichments[kmer] = enrichment
                elif float(enrichment) < 1:
                    kmers.append(kmer.lower())
                    enrichments[kmer] = enrichment

    # All-vs-all distances, upper-cased so case never affects the metric.
    for kmer in kmers:
        levenshteins[kmer] = [
            levenshtein.levenshtein(kmer.upper(), KMER.upper())
            for KMER in kmers
        ]

    # Stack the per-kmer rows into a matrix (kept as an incremental vstack
    # so a single-kmer input still yields the original 1-D output shape).
    levenshtein_array = np.array(levenshteins[kmers[0]])
    for idx, kmer in enumerate(kmers):
        if idx > 0:
            levenshtein_array = np.vstack(
                [levenshtein_array, levenshteins[kmer]])

    print(levenshtein_array)
    # Output array
    np.savetxt(outfile, levenshtein_array, delimiter='\t', fmt='%2f')

    # Put the kmer list (in order) at the top of the output file.
    with open(outfile, 'r+') as f:
        old = f.read()
        f.seek(0)
        f.write(('\t').join(kmers) + '\n' + old)
コード例 #22
0
    def get_most_likely_individuals(self, query, distance=3):
        """Return individuals within *distance* edits of *query*, best first.

        Duplicates are removed while keeping the distance-sorted order.
        """
        candidates = self.get_individuals_for_levenshtein()
        scored = [(levenshtein(query, person), person) for person in candidates]
        hits = sorted(pair for pair in scored if pair[0] <= distance)
        # dict.fromkeys dedupes while preserving the sorted ordering.
        return [pair[1] for pair in dict.fromkeys(hits)]
コード例 #23
0
ファイル: hw1_dhaval.py プロジェクト: sanskrit-lexicon/VCP
def glueto(hw):
	"""Resolve a parenthesised variant in headword *hw* into one form.

	*hw* looks like 'pre(mid)post'.  The parenthesised *mid* replaces
	either the tail of *pre* or the head of *post*, whichever is more
	similar by Levenshtein distance (with special-case rules first).
	Returns 'resolvedword:RULE' where RULE numbers the heuristic used,
	or ':404' when no rule applies.

	NOTE(review): m is None when hw contains no '(...)' -- m.group would
	then raise AttributeError; confirm callers guarantee the pattern.
	"""
	global counter1, counter2
	hw = hw.replace(' ','')
	hw = hw.replace('*','')
	m = re.search('(.*)[(](.+)[)](.*)',hw)
	# Strip non-letters from the prefix, parenthesised middle, and suffix.
	pre, mid, post = re.sub('[^a-zA-Z]','',m.group(1)), re.sub('[^a-zA-Z]','',m.group(2)), re.sub('[^a-zA-Z]','',m.group(3))
	# decide the place to change
	prelev = levenshtein.levenshtein(pre[-len(mid):],mid)
	postlev = levenshtein.levenshtein(post[:len(mid)],mid)
	out = hw+":404"
	# Special-case rules (5-7) take precedence over the distance heuristics.
	if re.search('.{1}[(].{1}[)]',hw): # a(A)nEpuRa
		#print hw, "5"
		out = mid+post+":5"
	elif re.search('abBr.{1}[(]Br.{1}[)]',hw): # abBra(Bra)puzpa, abBro(Bro)tTa
		#print hw, "6"
		out = "a"+mid+post+":6"
	elif re.search('UrdDa[(]rdDva[)]',hw): # abBra(Bra)puzpa, abBro(Bro)tTa
		#print hw, "7"
		out = "UrdDva"+post+":7"
	elif prelev < postlev:
		# mid is closer to the prefix tail: replace the tail of pre.
		if pre[-len(mid):].startswith(mid[0]) and len(pre)>=len(mid) and not pre[-len(mid):]==mid:
			#print hw, "1"
			out = pre[:-len(mid)]+mid+post+":1"
		elif pre[-len(mid):].endswith(mid[-1]) and len(pre)>=len(mid) and not pre[-len(mid):]==mid:
			#print hw, "2"
			out = pre[:-len(mid)]+mid+post+":2"
	elif postlev < prelev:
		# mid is closer to the suffix head: replace the head of post.
		if post[:len(mid)].startswith(mid[0]) and len(post)>=len(mid) and not post[:len(mid)]==mid:
			#print hw, "3"
			out = pre+mid+post[len(mid):]+":3"
		elif post[:len(mid)].endswith(mid[-1]) and len(post)>=len(mid) and not post[:len(mid)]==mid:
			#print hw, "4"
			out = pre+mid+post[len(mid):]+":4"
	# Distance tie: fall back to the alphabet-distance score (rules 8-9).
	elif alphabetdistance.distancescore(pre[:-len(mid)],mid) < alphabetdistance.distancescore(post[:len(mid)],mid):
		#print hw, "8"
		out = pre[:-len(mid)]+mid+post+":8"
	elif alphabetdistance.distancescore(pre[:-len(mid)],mid) > alphabetdistance.distancescore(post[:len(mid)],mid):
		#print hw, "9"
		out = pre+mid+post[len(mid):]+":9"
	return out
コード例 #24
0
def compare(word1, word2):
    """Return a similarity score for two words based on edit distance.

    Words are normalised (stripped, lower-cased, spaces removed) before
    comparison; identical words score 1.0.

    NOTE(review): for lev > 0 the returned expression algebraically
    simplifies to shorter_len / longer_len -- the Levenshtein distance
    cancels out entirely.  Confirm whether that is the intended formula.
    """
    word1 = word1.strip().lower().replace(u' ', u'')
    word2 = word2.strip().lower().replace(u' ', u'')
    word1_len = len(word1)
    word2_len = len(word2)
    lev = levenshtein(word1, word2)
    if lev == 0:
        return 1.0
    else:
        if word1_len > word2_len:
            return (float(lev) / word1_len) * 1 / (float(lev) / word2_len)
        else:
            return (float(lev) / word2_len) * 1 / (float(lev) / word1_len)
コード例 #25
0
def findByORC(wordToFix, lexicon):
    """Replace the word portion of *wordToFix* with the closest lexicon
    candidate sharing the same ORC code.

    NOTE: Python 2 module (print statements).  The candidate slicing
    (seq[1][4] + seq[1][7:11]) and the 'TT' splice in the return value
    encode a project-specific record layout -- confirm against the
    lexicon format before changing.
    """
    # Candidates whose ORC code matches the word being fixed.
    sameORC = [
        seq[1][4] + seq[1][7:11] for seq in lexicon
        if orcDecode.extractORC(seq[1]) == orcDecode.extractORC(wordToFix)
    ]
    print sameORC
    word = orcDecode.extractword(wordToFix)
    # Edit distance of each candidate to the extracted word.
    levDist = [levenshtein.levenshtein(word, possible) for possible in sameORC]
    print levDist
    # Best candidate = smallest distance (first on ties).
    levPerPossible = min(zip(sameORC, levDist), key=lambda x: x[1])
    print levPerPossible
    return wordToFix[:4] + levPerPossible[0][0] + 'TT' + levPerPossible[0][
        1:] + wordToFix[11:]
コード例 #26
0
ファイル: views.py プロジェクト: isergey/libcms
def compare(word1, word2):
    """Similarity score for two words; 1.0 means identical after
    normalisation (strip, lower-case, remove spaces)."""
    first = word1.strip().lower().replace(u' ', u'')
    second = word2.strip().lower().replace(u' ', u'')
    distance = levenshtein(first, second)
    if distance == 0:
        return 1.0
    first_len, second_len = len(first), len(second)
    # Ratio of per-length distances, longer word's ratio first.
    if first_len > second_len:
        return (float(distance) / first_len) * 1 / (float(distance) / second_len)
    return (float(distance) / second_len) * 1 / (float(distance) / first_len)
コード例 #27
0
ファイル: analyse.py プロジェクト: frabcus/judgmental
def best_filename(year, abbreviated_court, court_url, citations):
    """Choose the best name for this judgment from the available citations.

    Generator: yields the preferred filename first, then numbered
    alternatives, and finally raises when 100 candidates are exhausted.
    """
    dummy_citation = "[%d] %s " % (year, abbreviated_court)

    # Citation most similar to the canonical "[year] court " prefix wins.
    scored = ((levenshtein.levenshtein(dummy_citation, s, deletion_cost=2,substitution_cost=2), s)
              for s in citations)
    (distance, name) = min(scored)

    basic_name = str(year) + "/" + name.replace(' ', '_').replace('/', '__')

    yield os.path.join(court_url, basic_name + ".html")
    for suffix in range(1, 100):
        yield os.path.join(court_url, basic_name + "_%d" % suffix + ".html")
    raise StandardConversionError("something's going wrong: we can't give this a filename")
コード例 #28
0
def main():
    """Print dictionary words within 2 edits of the word given on argv[1].

    Candidates must share the first letter with the subject and differ
    from it.  Fix: the dictionary file handle was never closed (now
    managed by `with`).
    """
    subject = sys.argv[1]
    leven_limit = 2

    similar = []

    with open("/usr/share/dict/web2") as dictionary:
        for line in dictionary:
            line = line.split("\n")[0]
            # Same first letter prunes most of the expensive distance calls.
            if subject != line and subject[0] == line[0] and levenshtein(
                    subject, line) <= leven_limit:
                similar.append(line)

    print(similar)
コード例 #29
0
ファイル: utils.py プロジェクト: kazarin1alex/lit
 def distance_to(self, text):
     """Return the Levenshtein distance from this node's text to *text*.

     All cost parameters come from the node itself; the node's memo and
     precol state are passed through so the levenshtein implementation
     can reuse previously computed columns.
     """
     return levenshtein(
         self.text,
         text,
         deletion_cost=self.deletion_cost,
         insertion_cost=self.insertion_cost,
         first_insertion_cost=self.first_insertion_cost,
         prepend_first_insertion_cost=self.prepend_first_insertion_cost,
         append_first_insertion_cost=self.append_first_insertion_cost,
         substitution_cost=self.substitution_cost,
         transposition_cost=self.transposition_cost,
         memo=self.memo,
         precol=self.precol
     )
コード例 #30
0
 def __combineTracesAndTree(self, traces):
     """Attach each trace to the tree sequence closest to it by edit distance.

     The candidate sequences are sorted first so the algorithm behaves
     deterministically; the first strictly-smaller distance wins ties.
     """
     candidate_sequences = list(self.__getAllPotentialSequencesTree(self.tree, ""))
     candidate_sequences.sort()
     for trace in traces:
         trace_sequence = self.traceToSequenceDict[trace]
         best_sequence = ""
         best_distance = sys.maxsize
         for candidate in candidate_sequences:
             current = levenshtein(trace_sequence, candidate)
             if current < best_distance:
                 best_distance, best_sequence = current, candidate
         self.__addCaseToTree(trace, best_sequence)
コード例 #31
0
ファイル: app.py プロジェクト: mpeddle/jubilant-pancake
def edit_distance():
    """Return the Levenshtein edit distance between two strings posted
    as JSON fields 'a' and 'b'."""
    try:
        payload = request.get_json()
        first, second = payload['a'], payload['b']
    except KeyError:
        # Either field missing from the posted JSON.
        return api_error("Please enter two values")
    return api_success({'result': levenshtein(first, second)})
コード例 #32
0
 def calculate_trace(s1, s2, model):
     """Build a Trace for s1 -> s2 from the levenshtein trace matrix,
     nudging ambiguous diagonal entries down one row.

     NOTE(review): the except clause catches KeyError, which plain nested
     lists never raise (they raise IndexError) -- confirm trace_matrix is
     dict-like, otherwise the negative indices at the borders silently
     wrap around instead of being caught.
     """
     # levenshtein returns (distance, edits, trace_matrix); keep the last two.
     edits, trace_matrix = levenshtein(s1, s2, model)[1:]
     # Walk backwards so later adjustments don't affect earlier cells.
     for i in reversed(range(len(trace_matrix))):
         for j in reversed(range(len(trace_matrix[i]))):
             try:
                 pass
                 # Move an ambiguous mark from (i, j) to (i+1, j).
                 if trace_matrix[i][j] and trace_matrix[i + 1][
                         j +
                         1] and trace_matrix[i][j - 1] and not trace_matrix[
                             i + 1][j] and not trace_matrix[i - 1][j]:
                     trace_matrix[i + 1][j] = True
                     trace_matrix[i][j] = False
             except KeyError:
                 pass
     return Trace(s1, s2, trace_matrix)
コード例 #33
0
ファイル: did_you_mean.py プロジェクト: jstep/did_you_mean
def closest_match(input_word: str, words: Set) -> str:
    """Return the word from *words* with the smallest edit distance to
    *input_word*.

    Distances are used as dict keys, so among equally distant words the
    last one seen wins (set iteration order is arbitrary anyway).
    """
    real_word(input_word)

    print("Calculating edit distance...")

    # Distance of every candidate from the input word.
    distances = [levenshtein(input_word, w) for w in words]

    # Map distance -> word (later words overwrite earlier ones on ties).
    by_distance = dict(zip(distances, words))

    return by_distance.get(min(by_distance.keys()))
コード例 #34
0
ファイル: kaomoji.py プロジェクト: jiyiiy/kaomoji-pilchard
    def default(self, name):
        """Render the page for one kaomoji, with up to 100 similar ones.

        Similarity ranks by edit distance, then by length difference.
        Unknown names raise KeyError (handled upstream).
        """
        name = name.decode('utf-8')
        # Existence check only -- raises KeyError for unknown names.
        map_kaomoji[name]

        ranked = sorted(
            (levenshtein(name, candidate), abs(len(candidate) - len(name)), candidate)
            for candidate in kaomoji_guess_list
        )
        similar_kaomoji = [entry[-1] for entry in ranked[:100]]

        return self._template('kaomoji/kaomoji_item.html', {
            'cur_kaomoji': name,
            'kaomoji_list': map_kaomoji[name],
            'similar_kaomoji': similar_kaomoji
        })
コード例 #35
0
ファイル: tests.py プロジェクト: mpeddle/jubilant-pancake
 def test_values(self):
     """Spot-check levenshtein against hand-computed distances."""
     cases = (
         ('Nebraska', 'Bill Brasky', 7),
         ('aa', '', 2),
         ('', 'aa', 2),
         ('AA', 'Aa', 1),
         ('ab', 'Aa', 2),
         ('aa', 'ab', 1),
         ('a', 'abc', 2),
     )
     for left, right, want in cases:
         self.assertEqual(levenshtein(left, right), want)
コード例 #36
0
 def test_values(self):
     """
     Tests for levenshtein.

     Each entry is [a, b, expected distance]; distances are case-sensitive
     (see the 'AA' vs 'Aa' case).
     """
     values = [
         ['Nebraska', 'Bill Brasky', 7],
         ['aa', '', 2],
         ['', 'aa', 2],
         ['AA', 'Aa', 1],
         ['ab', 'Aa', 2],
         ['aa', 'ab', 1],
         ['a', 'abc', 2],
     ]
     for a, b, expected in values:
         distance = levenshtein(a, b)
         self.assertEqual(distance, expected)
コード例 #37
0
    def get_features_tgt(self, target, parallelsentence):
        """
        Calculates Levenshtein distance for the given target sentence, against the reference sentence
        @param target: The target sentence to be scored
        @type target: sentence.sentence.SimpleSentence
        @rtype: dict
        @return: dictionary containing Levenshtein distance as an attribute
        """
        target_untokenized = target.get_string()
        try:
            ref_untokenized = parallelsentence.get_reference().get_string()
            wer_value = levenshtein(target_untokenized, ref_untokenized)
            return {'ref-lev': str(wer_value)}
        except Exception:
            # Missing/unreadable reference: return no features rather than crash.
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            return {}
コード例 #38
0
ファイル: Top52.py プロジェクト: thiamshui/gigglesearch
def find10(word, C):
    """Return the 10 candidates in *C* closest to *word* by edit distance.

    Exact matches are skipped, the distance cap starts at len(word), and
    ties with the current tenth-best entry are dropped.

    Bug fix: the original distance<edit_dist10[4] branch assigned to the
    wrong variable (edit_dist5) and spliced candidate words (top10[4:9])
    into the distance list, corrupting both lists; the ten near-identical
    elif branches are unified into one insertion loop with the correct
    splice at every position.
    """
    minimum = len(word)
    top10 = [""] * 10
    edit_dist10 = [minimum] * 10

    for c in C:
        edit_dist = levenshtein.levenshtein(word, c)
        # Skip exact matches and anything not strictly better than the worst kept.
        if edit_dist == 0 or edit_dist > edit_dist10[9]:
            continue
        # Insert at the first slot it strictly beats; drop the last entry.
        for pos in range(10):
            if edit_dist < edit_dist10[pos]:
                edit_dist10 = edit_dist10[:pos] + [edit_dist] + edit_dist10[pos:9]
                top10 = top10[:pos] + [c] + top10[pos:9]
                break
    print(top10)
    return top10
コード例 #39
0
ファイル: suggest.py プロジェクト: vishalbelsare/Cologne
def suggesthw(inputword):
	"""Suggest headwords close to *inputword* from the hw11.txt list.

	Returns the input itself when it is already a known headword;
	otherwise returns every candidate sharing the first letter and exact
	length with the input that achieves the lowest edit distance, as
	(headword, distance, initial-match) tuples.

	NOTE(review): Python 2 module -- map() returns a list here; under
	Python 3 the iterator would be exhausted by the membership test and
	the comprehension below would see nothing.  The codecs handle is
	never closed.
	"""
	#hw1 = h.hw1()
	fin = codecs.open('hw11.txt','r','utf-8')
	hw1 = fin.readlines()
	hw1 = map(triming,hw1)
	if inputword in hw1:
		print "word found in hw1"
		return inputword
	else:
		output = []
		# Candidates must share the first character and the exact length.
		typicalheadwords = [member for member in hw1 if (re.search('^'+inputword[0],member) and len(inputword)==len(member))]
		for headword in typicalheadwords:
			output.append( (headword,lev.levenshtein(inputword,headword),initmatch(inputword,headword)) )
		# Sort by initial match (desc), then stable-sort by edit distance (asc).
		output = sorted(output,key=lambda x: x[2], reverse=True)
		output = sorted(output,key=lambda x: x[1])
		leasteditdistance = output[0][1]
		leastinitmatch = output[0][2]
		return [(hw,edit,init) for (hw,edit,init) in output if edit==leasteditdistance]
コード例 #40
0
 def test(self):
     """Evaluate correction accuracy on the held-out pairs.

     For each (incorrect, correct) pair, predicts a correction via the
     trained model over the 100 nearest dictionary candidates, prints the
     misses, and finally prints the overall hit rate.  Python 2 module
     (print statements below).
     """
     count = 0
     for incorrect, correct in self.ppairs_test:
         # Get the top 100 candidates with smallest levenshtein distance
         test_pairs = [(incorrect, candidate) for candidate in 
                       heapq.nsmallest(100, self.dictionary, key=lambda x: levenshtein.levenshtein(incorrect, x))]
         gx_test = self.fe.transform(test_pairs)
         # Pr is a list of probability, corresponding to each correction pair in test_pairs 
         pr = self.m.predict_proba(gx_test)
         cr = zip(pr, test_pairs)
         # We use the one with largest probability as the correction of the incorrect word
         cr = max(cr, key=lambda x: x[0][0])
         if cr[1][1] == correct:
             count += 1
         else:
             # Show the miss: expected pair, then the model's prediction.
             print (incorrect, correct),
             print cr[1][1]
         print
     # Fraction of test pairs corrected successfully.
     print count/float(len(self.ppairs_test))
コード例 #41
0
ファイル: bayes.py プロジェクト: Zotek/PJN
 def __init__(self, corpfile, formcount, errorfile):
     """Build word-frequency and error-distance statistics.

     Counts word occurrences in *corpfile*, and for each 'wrong;right'
     line in *errorfile* tallies how often each Levenshtein distance
     occurs between the two forms.

     Fix: both codecs handles were opened without ever being closed --
     now managed by `with` blocks.
     """
     self.word_count = 0
     self.word_occurences = {}
     self.accepted_form_count = formcount
     self.error_occurences = {}
     # Frequency of every whitespace-separated word in the corpus.
     with codecs.open(corpfile, encoding="utf-8") as corpus:
         for line in corpus:
             line = self._prepare_line(line)
             for word in re.split("\\s", line):
                 self.word_occurences[word] = self.word_occurences.get(word,
                                                                       0) + 1
                 self.word_count += 1
     # Histogram of edit distances observed in the known-error pairs.
     self.error_count = 0
     with codecs.open(errorfile, encoding="utf-8") as errors:
         for line in errors:
             self.error_count += 1
             splited = line.split(";")
             lev = levenshtein(splited[0], splited[1])
             self.error_occurences[lev] = self.error_occurences.get(lev, 0) + 1
コード例 #42
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
 def Results(self):
         '''
         Print every identified unique log event type.

         Consecutive entries whose Levenshtein distance is below
         ClusterGroup.VarDistance are treated as near-duplicates and
         printed indented under the previous entry.  Python 2 module
         (print statements below).

         @return None
         '''
         #if options.outfile == true: dump to file 
         print "\n========== Potential Unique Log Events ==========\n"
         self.BuildResultsTree(self.rootNode)

         #Todo - commandline args to toggle levenshtein identification of dupes

         # NOTE(review): each entry is compared only to the immediately
         # preceding one -- assumes self.entries groups duplicates
         # adjacently; confirm the ordering guarantee.
         previous = ''
         for entry in self.entries:
             if levenshtein.levenshtein(entry, previous) < ClusterGroup.VarDistance : 
                 print "\t" + entry
             else:
                 print entry
             previous = entry
def computeDifference(entry, malBehavior):
    """Compare a trace's most malicious segment with the injected behavior.

    :param entry: the most malicious segment of a trace (call list as a
        Python literal, followed by ';'-separated fields)
    :param malBehavior: malicious behavior inserted into the benign traces
    :type entry: string
    :type malBehavior: list
    :return: 'contains_flag;distance' where the flag is 1 when the segment
        contains the injected behavior and 0 otherwise, and distance is
        the Levenshtein distance between segment and behavior
    :rtype: string
    """
    # The call list is the first ';'-field, stored as a Python literal.
    calls_literal = entry.split(';')[0]
    observed_calls = ast.literal_eval(calls_literal)
    distance = lev.levenshtein(observed_calls, malBehavior)
    # Appearance flag: does the segment actually contain the behavior?
    flag = '1' if checkForBehavior(observed_calls, malBehavior) else '0'
    return flag + ';' + str(distance)
コード例 #44
0
ファイル: fuzzylas.py プロジェクト: amunozNFX/fuzzylas
def guess_simple2(data, word, lim):
    """Fuzzy lookup: return up to *lim* entries of *data* whose keys are
    closest to *word* by Levenshtein distance.

    :param data: mapping from candidate words to their payloads
    :param word: the (possibly misspelled) word to match against
    :param lim: maximum number of matches to return
    :return: dict mapping the best-matching keys to their payloads
    """
    words = []
    distances = []
    smallest = 100
    for w in data:
        distance = levenshtein.levenshtein(word, w)
        if distance <= smallest:
            # New best (or tied) match: push to the front so the head of
            # the list always holds the closest words seen so far.
            words.insert(0, w)
            distances.insert(0, distance)
            smallest = distance
        else:
            words.append(str(w))
            distances.append(distance)

    output = {}
    # Clamp to the number of candidates so a large `lim` no longer raises
    # IndexError on small datasets; also dropped the dead `i += 1` that
    # had no effect inside the range() loop.
    for i in range(min(lim, len(words))):
        output[words[i]] = data[words[i]]

    return output
コード例 #45
0
ファイル: sratim.py プロジェクト: michaelvilensky/homedir
def download_subtitles_for_path(path):
    video_filenames = list(get_videos(path))
    print "checking against %d video files" % len(video_filenames)
    for title, sub_id in get_last_subs():
        print "checking", title
        for video_filename in video_filenames:
            basename, _ = os.path.splitext(video_filename)
            score = levenshtein(title, basename)
            if score < LEVENSHTEIN_DIST_BOUND or basename.startswith(title) or title.startswith(basename):
                print title, basename, score
                [downloaded] = list(get_subtitle(sub_id, path))
                full_path_sub = os.path.join(path, downloaded)
                _, sub_ext = os.path.splitext(downloaded)
                full_path_vid = os.path.join(path, basename)
                name_for_sub = full_path_vid + sub_ext
                i = 0
                while os.path.exists(name_for_sub):
                    name_for_sub = full_path_vid + '.' + `i` + sub_ext
                    i += 1
                os.rename(full_path_sub, full_path_vid + sub_ext)
コード例 #46
0
ファイル: fuzzylas.py プロジェクト: rayosclx/fuzzylas
def guess_simple2(data, word, lim):
    """Fuzzy lookup: return up to *lim* entries of *data* whose keys are
    closest to *word* by Levenshtein distance.

    :param data: mapping from candidate words to their payloads
    :param word: the (possibly misspelled) word to match against
    :param lim: maximum number of matches to return
    :return: dict mapping the best-matching keys to their payloads
    """
    words = []
    distances = []
    smallest = 100
    for w in data:
        distance = levenshtein.levenshtein(word, w)
        if distance <= smallest:
            # New best (or tied) match: push to the front so the head of
            # the list always holds the closest words seen so far.
            words.insert(0, w)
            distances.insert(0, distance)
            smallest = distance
        else:
            words.append(str(w))
            distances.append(distance)

    output = {}
    # Clamp to the number of candidates so a large `lim` no longer raises
    # IndexError on small datasets; also dropped the dead `i += 1` that
    # had no effect inside the range() loop.
    for i in range(min(lim, len(words))):
        output[words[i]] = data[words[i]]

    return output
コード例 #47
0
    def Results(self):
        '''
                Display all identified unique log event types
                
                @return None
                '''
        #if options.outfile == true: dump to file
        print "\n========== Potential Unique Log Events ==========\n"
        self.BuildResultsTree(self.rootNode)

        #Todo - commandline args to toggle levenshtein identification of dupes

        previous = ''
        for entry in self.entries:
            if levenshtein.levenshtein(entry,
                                       previous) < ClusterGroup.VarDistance:
                print "\t" + entry
            else:
                print entry
            previous = entry
コード例 #48
0
ファイル: urls_utils.py プロジェクト: kittle/htmlwipe
def grouping_list_by_levenstein(urls, key=lambda x: x):
    """Group consecutive (after sorting by *key*) items whose keys are at
    Levenshtein distance exactly 1 from each other.

    :param urls: iterable of items to group
    :param key: callable extracting the comparison string from an item
    :return: list of groups (lists of items); items that join no group
        are omitted from the result
    """
    groups = []
    current_group = []
    previous = None
    for url in sorted(urls, key=key):
        if previous:
            if levenshtein(key(previous), key(url)) == 1:
                # Anchor the group with the item that started the chain.
                if not current_group:
                    current_group.append(previous)
                current_group.append(url)
            elif current_group:
                # Chain broken: flush the finished group.
                groups.append(current_group)
                current_group = []
                #urls.insert(0, url)
        previous = url
    # Flush a trailing group (the original used a no-break for/else,
    # which runs unconditionally once the loop finishes).
    if current_group:
        groups.append(current_group)
    return groups
コード例 #49
0
 def analytic_score_sentences(self, sentence_tuples):
     """Score (hypothesis, reference) pairs by mean Levenshtein distance."""
     distances = [levenshtein(hyp, ref) for hyp, ref in sentence_tuples]
     return {'ref-lev': average(distances)}
コード例 #50
0
ファイル: tests.py プロジェクト: mpeddle/jubilant-pancake
 def test_blank(self):
     """Two empty strings are at Levenshtein distance 0."""
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # assertEqual is the supported spelling.
     self.assertEqual(levenshtein('', ''), 0)
コード例 #51
0
ファイル: find_match_utils.py プロジェクト: heolin123/scripts
def levenshtein(text1, text2):
    """Return the Levenshtein distance between *text1* and *text2*,
    normalized by the length of *text1*.

    :param text1: reference string (the normalization denominator)
    :param text2: string compared against *text1*
    :return: float; for an empty *text1* the raw (unnormalized) distance
        is returned instead of raising ZeroDivisionError
    """
    value = lev.levenshtein(text1, text2)
    # Guard: an empty reference string would divide by zero.
    if not text1:
        return float(value)
    return float(value) / float(len(text1))
コード例 #52
0
ファイル: main.py プロジェクト: bestonredbox/redbox-mashup
def download_movies(page):
    """Fetch one page of movies from the Redbox API, persist each as a
    Movie entity, and enrich it with Rotten Tomatoes scores.

    Returns early (None) when the Redbox page is empty/unparseable or
    when Rotten Tomatoes answers 403 (API key rejected).

    :param page: Redbox API page number to download (pageSize is 10)
    """
    url = "https://api.redbox.com/v3/products/movies?pageSize=10&pageNum=%s&apiKey=%s"\
        % (page, REDBOX_APIKEY)
    logging.info("Fetching products...")
    try:
        response = fetch(url, headers={'Accept': 'application/json'})
        logging.info("complete!")
        movies = json.loads(response.content)
    except:
        # NOTE(review): bare except silently swallows network/JSON errors
        # and treats them as an empty page — consider narrowing.
        movies = {}
    # An empty or malformed payload means there are no more pages.
    if 'Products' not in movies or \
            'Movie' not in movies['Products'] or \
            len(movies['Products']['Movie']) == 0:
        logging.info("Download complete!")
        return
    for obj in movies['Products']['Movie']:
        movie_id = obj['@productId']
        movie = Movie.get_by_id(movie_id)
        if movie is None:
            movie = Movie(id=movie_id)
        # Copy every scalar field, stripping the '@' prefix Redbox uses
        # on attribute-style keys and lowercasing the property name.
        properties = {}
        for key in obj:
            if type(obj[key]) != dict:
                properties[key.replace('@','').lower()] = obj[key]
        movie.populate(**properties)
        if type(movie.title) != str and type(movie.title) != unicode:
            movie.title = unicode(movie.title)
        if 'RatingContext' in obj and \
                '@ratingReason' in obj['RatingContext']:
            movie.ratingReason = obj['RatingContext']['@ratingReason']
        if 'Actors' in obj and 'Person' in obj['Actors']:
            movie.actors = ", ".join(obj['Actors']['Person'])
        # Third box-art link is used as the thumbnail when present —
        # presumably the medium-size image; confirm against the API.
        if 'BoxArtImages' in obj and 'link' in obj['BoxArtImages'] \
                and type(obj['BoxArtImages']['link']) == list \
                and len(obj['BoxArtImages']['link']) >= 3 \
                and '@href' in obj['BoxArtImages']['link'][2]:
            movie.thumb = obj['BoxArtImages']['link'][2]['@href']
        movie.put()

        # Don't recalc score if it's really bad
        if hasattr(movie, 'score') and movie.score < 50 and movie.score > 0:
            continue
        # -1 marks "score not yet computed" for the matching loop below.
        movie.score = -1

        # Then look up Rotten Tomatoes scores
        url = "http://api.rottentomatoes.com/api/public/v1.0/movies.json?q=%s&apikey=%s"\
            % (urllib.quote(unicodedata.normalize('NFKD', movie.title).encode('ascii', 'ignore')), RT_APIKEY)
        response = fetch(url)
        if response.status_code != 200:
            logging.error("Could not retrieve Rotten Tomatoes information for %s: %s" % (obj['Title'], url))
            content = '{"movies":{}}'
            if response.status_code == 403:
                # 403 means the API key is rejected — no point continuing.
                return
        else:
            content = response.content
        for result in json.loads(content.strip())['movies']:
            # Take the first search result whose title is within 20%
            # normalized Levenshtein distance of ours; -1 guards against
            # overwriting an already-matched score.
            if (not hasattr(movie, 'score') or movie.score == -1) and \
                    levenshtein(movie.title, unicode(result['title'])) / \
                    len(movie.title) < 0.2:
                # This is where the magic happens
                logging.info("Recalculating score for %s" % obj['Title'])
                movie.critics_score = result['ratings']['critics_score']
                movie.critics_consensus = result['critics_consensus'] \
                    if 'critics_consensus' in result else ''
                movie.audience_score = result['ratings']['audience_score']
                # Base score: mean of critics and audience scores.
                movie.score = int((
                    result['ratings']['critics_score'] +
                    result['ratings']['audience_score']
                ) / 2)

                if 'release_dates' in result and \
                        'dvd' in result['release_dates']:
                    movie.dvdreleasedate = result['release_dates']['dvd']
                if 'release_dates' in result and \
                        'theatre' in result['release_dates']:
                    movie.theatrereleasedate = \
                        result['release_dates']['theatre']

                # Adjust score based on release date
                try:
                    daysago = (datetime.now() - \
                        datetime.strptime(movie.dvdreleasedate, \
                        "%Y-%m-%d")).days
                except:
                    # Missing/unparseable DVD date: assume 90 days old
                    # (neutral — triggers no bonus, only the >90 penalty
                    # is skipped by <=).
                    daysago = 90
                movie.daysago = daysago
                # Recency bonuses/penalty (bonuses stack for <=7 days).
                if daysago <= 30:
                    movie.score += 5
                if daysago <= 7:
                    movie.score += 10
                if daysago > 90:
                    movie.score -= 20
                if not hasattr(movie, 'score'):
                    movie.score = 0

                # Save Rotten Tomatoes metadata
                try:
                    movie.rottentomatoeslink = result['links']['alternate']
                except:
                    # This way, it always goes *at least* to the RT site,
                    # and we avoid putting more logic in the template.
                    movie.rottentomatoeslink = 'http://www.rottentomatoes.com/'

        # Save and return movie
        movie.put()
コード例 #53
0
ファイル: Treedistance.py プロジェクト: jfrancois/mam
 def similarity(self, val):
     """Return a similarity score in [0, 1]: 1.0 for identical values,
     lower as the normalized Levenshtein distance grows.
     """
     import levenshtein
     mine = self._val
     other = val.get_val()
     longest = float(max(len(mine), len(other)))
     return 1 - levenshtein.levenshtein(mine, other) / longest
コード例 #54
0
ファイル: analyse.py プロジェクト: frabcus/judgmental
def write_metadata_to_sql(d,cursor,rel_judgment_dir):
    """Insert judgment metadata (court, judgment, citations, parties)
    into the SQL database, creating court/category rows as needed.

    :param d: dict of parsed judgment metadata (court_name, date, title,
        filename, bailii_url, citations, parties, ...)
    :param cursor: open sqlite cursor used for all statements
    :param rel_judgment_dir: relative directory used to build court URLs
    """

    # make sure there's a record for the court
    def get_court(court_name):
        # Look up an existing court row by exact name.
        cursor.execute('SELECT courtid,abbreviated_name,url FROM courts WHERE name = ?', (court_name,))
        result = cursor.fetchone()
        return result

    result = get_court(d["court_name"])
    if not result:
        # No exact match: fuzzy-match against the known court list by
        # Levenshtein distance, canonicalize the name, and retry.
        # NOTE(review): `long` shadows the Python 2 builtin here.
        abbreviated_court,d["court_name"] = min((levenshtein.levenshtein(d["court_name"], long), short, long) for (short, long) in courts.courts)[1:]
        result = get_court(d["court_name"])

    if result:
        (courtid,abbreviated_court,court_url) = result
    else:
        # Court still unknown even after canonicalization: create it.
        court_url = os.path.join(rel_judgment_dir,abbreviated_court+'/')
        
        # Find the correct court category
        courtcategory = False
        for category,l in courts.categories.iteritems():
            if abbreviated_court in l:
                courtcategory = category
                break
        if not courtcategory:
            raise StandardConversionError("something's going wrong: we can't find a courtcategory for " + abbreviated_court)
        cursor.execute('SELECT courtcategoryid FROM courtcategories WHERE name = ?', (courtcategory,))
        result = cursor.fetchone()
        if result:
            courtcategoryid = result[0]
        else:
            # Category row doesn't exist yet: create it on the fly.
            cursor.execute('INSERT INTO courtcategories (name) VALUES (?)',(courtcategory,))
            courtcategoryid = cursor.lastrowid

        cursor.execute('INSERT INTO courts(name, courtcategoryid, abbreviated_name,url) VALUES (?,?,?,?)', (d["court_name"],courtcategoryid,abbreviated_court,court_url))
        courtid = cursor.lastrowid
        
    # insert a record
    # Try candidate filenames in order; the first INSERT that doesn't
    # violate a uniqueness constraint wins.
    for judgmental_url in best_filename(d["date"].year, abbreviated_court, court_url, d["citations"]):
        try:
            cursor.execute('INSERT INTO judgments(title, date, courtid, filename, bailii_url, judgmental_url) VALUES (?, ?, ?, ?, ?, ?)', (d["title"], d["date"], courtid, d["filename"], d["bailii_url"], judgmental_url))
            break
        except sqlite.IntegrityError:
            pass
    judgmentid = cursor.lastrowid

    # store the citations
    for c in d["citations"]:
        # Reuse an existing citation-code row or create a new one.
        cursor.execute('SELECT citationcodeid FROM citationcodes WHERE citationcode = ?', (c,))
        result = cursor.fetchone()
        if result:
            i = result[0]
        else:
            cursor.execute('INSERT INTO citationcodes(citationcode) VALUES (?)', (c,))
            i = cursor.lastrowid
        cursor.execute('INSERT INTO judgmentcodes(citationcodeid, judgmentid) VALUES (?, ?)', (i,judgmentid))

    # store the parties
    for (i,n) in d["parties"]:
        cursor.execute('INSERT INTO parties(position, name, judgmentid) VALUES (?, ?, ?)', (i,n,judgmentid))