Esempio n. 1
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     """Apply the absorption law at node i: p & (p v q) -> p, and its
     dual p v (p & q) -> p.

     `statement` is a proposition tree addressed like a binary heap
     (children of node i are i*2+1 and i*2+2, as the index arithmetic
     below shows).  Returns a single successor Statement, a list of
     them when both orientations fire, or None when neither does.
     `goalCoHash` is accepted for interface uniformity but unused here.
     """
     if statement.type(i) == "conjunction" or statement.type(i) == "disjunction":
         thisType = statement.type(i)
         if thisType == "conjunction": otherType = "disjunction"
         else: otherType = "conjunction"
         successors = []
         if statement.type(i*2+2)==otherType: # ie p & (q v r); thisType=="conjunction", otherType = "disjunction"
             p = statement.childTree(i*2+1)
             q = statement.childTree(i*4+5)
             r = statement.childTree(i*4+6)
             if p==q or p==r:
                 # p appears inside the other-typed child, so p absorbs
                 # the whole subexpression: graft p in at node i.
                 successor = statement.graft(i,p)
                 successor.action = self.name
                 successor.cost = self.cost + distance(str(statement), str(successor))
                 successors.append(successor)
                 
         if statement.type(i*2+1)==otherType: # ie (q v r) & p; thisType=="conjunction", otherType = "disjunction"
             p = statement.childTree(i*2+2)
             q = statement.childTree(i*4+3)
             r = statement.childTree(i*4+4)
             if p==q or p==r:
                 successor = statement.graft(i,p)
                 successor.action = self.name
                 successor.cost = self.cost + distance(str(statement), str(successor))
                 successors.append(successor)
         if len(successors) == 1:
             return successors[0]
         elif len(successors) > 1:
             return successors
         else: return None
Esempio n. 2
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     """Apply annihilator laws at node i: p & False -> False and
     p v True -> True (either child may hold the constant).

     Returns a successor Statement with action/cost set, or None when
     the rule does not apply.  `goalCoHash` is unused here.
     """
     if statement.type(i) == "conjunction":
         if statement.type(i*2+1)=="false_constant":
             # Copy the whole tree (childTree(0) is the root), then
             # collapse node i down to the bare constant.
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i, "false_constant")
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
         elif statement.type(i*2+2)=="false_constant":
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i, "false_constant")
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
     elif statement.type(i) == "disjunction":
         if statement.type(i*2+1)=="true_constant":
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i, "true_constant")
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
         elif statement.type(i*2+2)=="true_constant":
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i, "true_constant")
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
def get_min_score(tokens, img_name):
  """Return the smallest Levenshtein distance between img_name and any
  token in tokens.

  tokens   -- non-empty iterable of strings to compare against
  img_name -- the target string
  Raises ValueError on an empty tokens sequence (the original raised
  IndexError in that case).
  """
  # min() over a generator replaces the manual first-element seed +
  # index loop of the original.
  return min(distance(token, img_name) for token in tokens)
Esempio n. 4
0
def search(start,goal,rules,verbose = False):
    """Best-first search from statement `start` to `goal`, expanding
    successors with `rules`.  The heuristic is the Levenshtein distance
    between a node's string form and the goal's string form.

    Returns a Derivation on success, False when the frontier empties
    (the statements are not logically equivalent), and raises
    TimeOutException after 2000 expansions.  (Python 2 syntax.)
    """
    goalStr = str(goal)
#    l = len(str(start))+len(str(goal))
    nodesExpanded = 0
    shortcuts = 0  # re-pushes of a frontier node reached via a cheaper path
    node = Node(start, None)
    node.cost = distance(str(node.state), goalStr)
    frontier = PriorityQueue()
    frontier.push(node,node.cost)
    explored = set()
    while not frontier.isEmpty():
        if nodesExpanded > 2000:
            raise TimeOutException()
        node = frontier.pop()
        if nodesExpanded%10==0:
            # progress trace every 10 expansions
            print nodesExpanded
        nodesExpanded += 1
        if node.state == goal:
#            print "expanded: ", nodesExpanded, " shortcuts: ", shortcuts
            print "expanded: ", nodesExpanded, " shortcuts: ", shortcuts
            return Derivation(start,goal,node.traceback(),rules)
        explored.add(node.state)
        for child in node.successors(rules,goal):
            h = distance(str(child.state), goalStr)
            # getCheapestCost(child) == -1 apparently means the child is
            # not yet on the frontier -- TODO confirm against PriorityQueue.
            if child.state not in explored and frontier.getCheapestCost(child) == -1:
                frontier.push(child, child.cost + h)
                if verbose: 
                    print child.cost, child.state, h
            elif frontier.getCheapestCost(child) > child.cost:
                shortcuts += 1
                frontier.push(child, child.cost + h)
    print "NOT LOGICALLY EQUIVALENT"
    return False
Esempio n. 5
0
def mate_pop(top, population):
    """Produce the next generation: cross every member of `population`
    with `top`, mutate the offspring, and score by Levenshtein distance
    to TARGET.

    Entries are (distance, string) tuples so list.sort() orders the
    result by fitness (smallest distance first).
    """
    new_pop = []
    new_pop.append((distance(top, TARGET), top))  # keep the elite member
    for pair in population:
        string = mutate(crossover(pair[1], top))
        d = distance(string, TARGET)
        new_pop.append((d, string))
    # NOTE(review): pop() runs BEFORE sort(), so it discards the last
    # offspring regardless of fitness.  It keeps the population size
    # constant, but dropping the worst after sorting was probably the
    # intent -- confirm.
    new_pop.pop()
    new_pop.sort()
    return new_pop
Esempio n. 6
0
def return_operon_string_distance(operon_string, gene_string):
    """Edit distance between an operon string and a gene string.

    Both orientations of the gene string are tried and the better one
    is used; the unavoidable length difference between the two strings
    is subtracted so pure overhang is not counted.
    """
    slack = len(operon_string) - len(gene_string)
    forward = distance(operon_string, gene_string) - slack
    backward = distance(operon_string, gene_string[::-1]) - slack
    return min(forward, backward)
Esempio n. 7
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     #maybe make commutativity discount its cost when it's close to the goalCoHash, using the older equivalence hashing method that ignores order for commuativ operators
     
     """Commutativity: swap the two children of the and/or node at i.

     The successor's cost uses the plain edit distance when the
     statement's commutative hash equals goalCoHash, and ten times that
     otherwise, discouraging swaps that are not progress toward the
     goal.  Returns a successor Statement or None.
     """
     if statement.type(i) == "conjunction" or statement.type(i) == "disjunction":
         left = statement[i*2+1]
         right = statement[i*2+2]
         successor = statement.graft(i*2+1,right)  # right subtree into the left slot
         successor.graftInPlace(i*2+2,left)        # left subtree into the right slot
         successor.action = self.name
         if statement.cohash()==goalCoHash: successor.cost = self.cost + distance(str(statement), str(successor))
         else: successor.cost = self.cost + 10*distance(str(statement), str(successor))
         return successor
Esempio n. 8
0
 def match(self, equipe):
     """Heuristically decide whether `equipe` is the team already
     registered for this participation.

     Two members count as the same person when both hold a licence with
     the same number, or when both last and first names are within
     Levenshtein distance 3 (case-insensitive).  Returns True when the
     match count reaches half the incoming team's size.
     """
     if self.categorie and not self.categorie.valide(equipe):
         return False
     equipiers_challenge = Equipier.objects.filter(equipe__challenges__participation=self)
     c = 0
     equipiers = equipe.equipier_set.all()
     # NOTE: no break below -- one member matching several registered
     # members increments c more than once.
     for e in equipiers:
         for e2 in equipiers_challenge:
             if e.justificatif == 'licence' and e2.justificatif == 'licence' and e.num_licence == e2.num_licence:
                 c += 1
             elif distance(e.nom.lower(), e2.nom.lower()) < 3 and distance(e.prenom.lower(), e2.prenom.lower()) < 3:
                 c += 1
     return c >= len(equipiers) / 2
Esempio n. 9
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     """De Morgan's laws applied at node i.

     * i is a negation of an and/or:   ~(p OP q) -> (~p DUAL ~q).
     * i is an and/or of two negations: (~p OP ~q) -> ~(p DUAL q).
     * with self.dangerous set, any and/or at i is rewritten the other
       way, (p OP q) -> ~(~p DUAL ~q), which can grow the tree.

     Returns a successor Statement with action/cost set, or None.
     `goalCoHash` is unused here.
     """
     if statement.type(i) == "negation":
         if statement.type(i*2+1) == "conjunction" or statement.type(i*2+1) == "disjunction":
             thisType = statement.type(i*2+1)
             if thisType == "conjunction": otherType = "disjunction"
             else: otherType = "conjunction"
             
             # Negated copies of the two operands (grandchildren of i).
             np = statement.negatedChildTree(i*4+3)
             nq = statement.negatedChildTree(i*4+4)
             
             # Build "np otherType nq" as a fresh tree, graft it at i.
             ns = Statement(dict(),statement.propMap)
             ns.insertProp(0, otherType)
             ns.graftInPlace(1,np)
             ns.graftInPlace(2,nq)
             successor = statement.graft(i,ns)
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
     elif statement.type(i) == "conjunction" or statement.type(i) == "disjunction":
         thisType = statement.type(i)
         if thisType == "conjunction": otherType = "disjunction"
         else: otherType = "conjunction"
         
         if self.dangerous:
             np = statement.negatedChildTree(i*2+1)
             nq = statement.negatedChildTree(i*2+2)
             
             # Build "~(np otherType nq)" and graft it at i.
             ns = Statement(dict(),statement.propMap)
             ns.insertProp(0, "negation")
             ns.insertProp(1, otherType)
             ns.graftInPlace(3,np)
             ns.graftInPlace(4,nq)
             successor = statement.graft(i,ns)
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             return successor
         
         else:
             # Safe direction: only fire when both operands are already
             # negations, so the rewrite strips them instead of adding.
             if statement.type(i*2+1) == "negation" and statement.type(i*2+2) == "negation":
                 p = statement.childTree(i*4+3)
                 q = statement.childTree(i*4+5)
                 ns = Statement(dict(),statement.propMap)
                 ns.insertProp(0, "negation")
                 ns.insertProp(1, otherType)
                 ns.graftInPlace(3,p)
                 ns.graftInPlace(4,q)
                 successor = statement.graft(i,ns)
                 successor.action = self.name
                 successor.cost = self.cost + distance(str(statement), str(successor))
                 return successor
Esempio n. 10
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     """Distribution at node i: p & (q v r) -> (p & q) v (p & r), and
     the dual, for the other-typed child on either side.

     Returns a single successor Statement, a list when both sides fire,
     or None.  `goalCoHash` is unused here.
     """
     if statement.type(i) == "conjunction" or statement.type(i) == "disjunction":
         thisType = statement.type(i)
         if thisType == "conjunction": otherType = "disjunction"
         else: otherType = "conjunction"
         successors = []
         if statement.type(i*2+2)==otherType: # ie p & (q v r); thisType=="conjunction", otherType = "disjunction"
             # p is copied twice because it appears in both new branches.
             p = statement.childTree(i*2+1)
             p2 = statement.childTree(i*2+1)
             q = statement.childTree(i*4+5)
             r = statement.childTree(i*4+6)
             
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i,otherType)       # _ v _
             successor.insertProp(i*2+1,thisType)    # (_ & _) v _
             successor.insertProp(i*2+2,thisType)    # (_ & _) v (_ & _)
             successor.graftInPlace(i*4+3,p)         # (p & _) v (_ & _)
             successor.graftInPlace(i*4+4,q)         # (p & q) v (_ & _)
             successor.graftInPlace(i*4+5,p2)        # (p & q) v (p2 & _)
             successor.graftInPlace(i*4+6,r)         # (p & q) v (p2 & r)
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             successors.append(successor)
             
             
         if statement.type(i*2+1)==otherType:
             # Mirror case: (q v r) & p.
             p = statement.childTree(i*2+2)
             p2 = statement.childTree(i*2+2)
             q = statement.childTree(i*4+3)
             r = statement.childTree(i*4+4)
             
             successor = statement.childTree(0)
             successor.prune(i)
             successor.insertProp(i,otherType)       
             successor.insertProp(i*2+1,thisType)    
             successor.insertProp(i*2+2,thisType)    
             successor.graftInPlace(i*4+3,p)    
             successor.graftInPlace(i*4+4,q)     
             successor.graftInPlace(i*4+5,p2)    
             successor.graftInPlace(i*4+6,r)    
             successor.action = self.name
             successor.cost = self.cost + distance(str(statement), str(successor))
             successors.append(successor)
         if len(successors) == 1:
             return successors[0]
         elif len(successors) > 1:
             return successors
         else: return None
Esempio n. 11
0
def check_distances(combined_hcv, report_file=None):
    """Report samples whose genotype label disagrees with their closest
    reference sequence.

    combined_hcv -- seekable FASTA file object, read twice: once for
                    'Ref.' entries, once for 'Sample.' entries.  The
                    genotype is the last '-'-separated field of a header.
    report_file  -- destination for report lines (None = stdout).

    For each sample, the Levenshtein distance to every reference is
    computed (gaps stripped); a line is printed when the overall
    closest reference is not one of the reported genotype.
    """
    references = [(header, sequence.replace('-', ''))
                  for header, sequence in iterate_fasta(combined_hcv)
                  if header.startswith('Ref.')]

    combined_hcv.seek(0)
    samples = ((header, sequence.replace('-', ''))
               for header, sequence in iterate_fasta(combined_hcv)
               if header.startswith('Sample.'))
    for header, sequence in samples:
        reported_genotype = header.split('-')[-1]
        reported_ref = best_ref = min_distance = reported_distance = None
        reported_size = best_size = 0
        for ref_header, ref_seq in references:
            ref_genotype = ref_header.split('-')[-1]
            d = distance(sequence, ref_seq)
            # Track the best reference of the REPORTED genotype ...
            if (ref_genotype == reported_genotype and
                    (reported_distance is None or d < reported_distance)):
                reported_distance = d
                reported_ref = ref_header
                reported_size = len(ref_seq)
            # ... and the best reference overall.
            if min_distance is None or d < min_distance:
                min_distance = d
                best_ref = ref_header
                best_size = len(ref_seq)
        if min_distance != reported_distance:
            best_genotype = best_ref.split('-')[-1]
            print(f'Reported {reported_genotype}, but {best_genotype} is '
                  f'closer: {header}(0/{len(sequence)}), '
                  f'{reported_ref}({reported_distance}/{reported_size}), '
                  f'{best_ref}({min_distance}/{best_size}).',
                  file=report_file)
Esempio n. 12
0
def extract_option(text):
    """
    Return the Option referenced by ``text`` (in various fuzzy ways), or
    raise a ValueError if none could be found. This function tries hard
    to find an Option, as sort-of documented by the ``tests`` module.

    Raises TypeError for non-string input.  (Python 2: relies on
    ``basestring``/``unicode``.)
    """

    if not isinstance(text, basestring):
        raise TypeError("Not a basestring: %r" % text)

    t = unicode(text).lower()
    matches = []

    for option in Option.objects.all():
        l = option.letter.lower()
        c = option.caption.lower()

        # return early if this is an exact match.
        if (t == l) or (t == c):
            return option

        # otherwise, compile a list of distances.
        d = distance(t, c)
        # (The None check is defensive; an edit distance is an int.)
        if (d is not None) and (d <= settings.MAX_MATCH_DISTANCE):
            matches.append((option, d))

    # return the closest option.
    if len(matches):
        m = sorted(matches, key=lambda x: x[1])
        return m[0][0]

    raise ValueError("No Option could be found in: %s" % text)
def find_similar_names(search_name, base, default_distance):
	"""Find people in `base` whose name parts are close to `search_name`.

	search_name -- sequence of name parts; a single-letter part is
	               treated as an initial (compared to the first letter
	               of the corresponding part only)
	base        -- dict mpid -> record; index 1 is the name (a sequence
	               of parts), index 5 the district
	default_distance -- per-part Levenshtein threshold (exclusive)

	Name parts are compared pairwise; zip() truncates to the shorter of
	the two sequences.  An exact full-length match returns immediately
	as a single-entry list; otherwise every record all of whose compared
	parts are within the threshold is collected.
	Returns a list of [mpid, name, district] entries.
	"""
	similar_names = list()
	for mpid in base.keys():
		# mpid, name, link, party, ticket, district,\
		# rid, rdate, urid, urdate, urreason,\
		# bio, profile, party12, ticket12, link12,\
		# district12, did12, dlink12, loh, lohcom,\
		# corrupt, autobio, biolink, decl, decllink

		name = base[mpid][1]
		district = base[mpid][5]

		dist = list()
		for pair in zip(search_name, name):
			search_name_el, name_el = pair
			if len(search_name_el) == 1:
				# Initial: compare against the first letter only.
				name_el = name_el[0:1]
			current_dist = distance(search_name_el, name_el)
			dist.append(current_dist)

		if len(search_name) == len(name) and sum(dist) == 0:
			return [[mpid, name, district]]

		# NOTE(review): when zip() produced no pairs, all() is True and
		# the record is accepted unconditionally -- confirm intended.
		if all(d < default_distance for d in dist):
			similar_names.append([mpid, name, district])
	return similar_names
Esempio n. 14
0
    def anomalies(self, request):
        """Admin view listing probable duplicate competitors for the
        course selected by the 'course_uid' cookie.

        Two entrants are flagged as duplicates when their full names
        ('nom prenom', lower-cased) are within Levenshtein distance 3.
        Entrants whose numero exceeds their team size are skipped.
        Renders admin/anomalies.html with the duplicate groups.
        """
        request.current_app = self.name
        uid = request.COOKIES['course_uid']
        course = Course.objects.get(uid=uid, accreditations__user=request.user)
        equipiers = list(Equipier.objects.filter(equipe__course=course).select_related('equipe__categorie'))

        doublons = []
        for i, e in enumerate(equipiers):
            if e.numero > e.equipe.nombre:
                continue
            dbl = []
            # Only compare forward (j > i) so each pair is seen once.
            for j in range(i + 1, len(equipiers)):
                e2 = equipiers[j]
                if e2.numero > e2.equipe.nombre:
                    continue
                if distance((e.nom + ' ' + e.prenom).lower(), (e2.nom + ' ' + e2.prenom).lower()) < 3:
                    dbl.append(e2)
            if dbl:
                dbl.insert(0, e)
                doublons.append(dbl)
        print(doublons)

        return TemplateResponse(request, 'admin/anomalies.html', dict(self.each_context(request),
            doublons=doublons,
            course=course,
        ))
Esempio n. 15
0
def fix_garbage_sugar(pairs):
    """Relabel the first pair whose name contains a near-match of the
    canonical sugars label.

    A 5-character window slides over each name; a window within
    Levenshtein distance 2 of Keywords.label.sugars marks that pair
    (identified by pair[2]) for relabeling.

    pairs -- sequence of (name, value, id) tuples.
    Returns a new list with the matched pair's name replaced by
    Keywords.label.sugars; other pairs are copied unchanged.

    Fix: the original reused the window index `i` as the matched id, so
    when no window matched, `i` kept a stale range value and the second
    loop could relabel an unrelated pair whose id happened to equal it.
    """
    target_id = None  # id (pair[2]) of the pair to relabel, if any

    for pair in pairs:
        name = pair[0]
        # Slide a 5-char window over the name; always try at least one.
        slide = max(1, (len(name) - 6) + 1)
        for start in range(slide):
            if distance(Keywords.label.sugars, name[start:(5 + start)]) <= 2:
                target_id = pair[2]
                break
        if target_id is not None:
            break

    betterPairs = []
    for pair in pairs:
        label = Keywords.label.sugars if pair[2] == target_id else pair[0]
        betterPairs.append((label, pair[1], pair[2]))

    return betterPairs
Esempio n. 16
0
def generate_aliases(table, ref_list, match_list, dist_limit=3):
    """Populate `table` with alias candidates by fingerprint distance.

    Every ref is inserted as its own canonical entry (if absent); every
    match whose fingerprint is within dist_limit of a ref's fingerprint
    is inserted as a candidate alias of that ref.  Progress is printed
    every 100000 comparisons.  Ends by calling write_aliases(table).
    (Python 2 syntax.)
    """
    comps = 0.0
    total_comps = float(max(1, len(ref_list) * len(match_list)))
    for ref in ref_list:
        if not table.find_one(name=ref["name"]):
            table.insert({"name": ref["name"], "fp": ref["fp"], "canonical": ref["name"]})

        for match in match_list:
            dist = distance(match["fp"], ref["fp"])
            comps += 1.0
            if comps and comps % 100000 == 0:
                pct_comps = int((comps / total_comps) * 100)
                print "%s matching: %s%%" % (table.table.name, pct_comps)
            if dist < dist_limit:
                if not table.find_one(name=match["name"]):
                    table.insert(
                        {
                            "name": match["name"],
                            "fp": match["fp"],
                            "candidate": ref["name"],
                            "distance": dist,
                            "canonical": match["name"],
                        }
                    )
                # print 'Match? %r -> %r' % (ref['name'], match['name'])

    write_aliases(table)
Esempio n. 17
0
    def search_by_similar_name(self, genus, species):
        """
        Search for Species with a similarly spelled name as the given name.

        This method can help correct spelling mistakes in species names.
        Candidates share the first and last two letters of both genus
        and species; the one with the smallest Levenshtein distance to
        "genus species" wins when that distance is under 3.  The result
        (or a newly created Species) is cached in species_by_fullname.
        (Python 2: relies on unicode().)
        """
        matches = self.filter(
            genus__startswith=genus[:2],
            genus__endswith=genus[-2:],
            species__startswith=species[:2],
            species__endswith=species[-2:]
        )
        complete_name = u" ".join((genus, species))
        min_match = 10
        min_match_species = None
        for match in matches:
            d = distance(complete_name, unicode(match))
            if d < min_match:
                min_match = d
                min_match_species = match

        # Fix: test the MINIMUM distance, not `d` (which held only the
        # last candidate's distance and was unbound when there were no
        # candidates at all).
        if min_match_species and min_match < 3:
            species_by_fullname[complete_name] = min_match_species
        else:
            # Fix: create from the arguments; the original referenced an
            # undefined name `i` here (i[0]/i[1]).
            species = Species.objects.create(genus=genus, species=species)
            species_by_fullname[complete_name] = species
Esempio n. 18
0
    def clustering(self, elems):
        """
        Clusterize the input elements.

        Input: list of words (e.g. list of URLs). It MUST be sorted!

        Builds a dict mapping cluster IDs (int) to lists of elements:
        each element joins the current cluster when its distance to the
        previous element is within DISTANCE, otherwise a new cluster
        starts.  The dict is stored under self.clusters['clusters'],
        along with the largest cluster and the cluster count.
        """
        clusters = {}
        cid = 0
        previous = None

        for elem in elems:
            # After the first element, compare against the one before it
            # (which is always the tail of the current cluster).
            if previous is not None and distance(previous, elem) > DISTANCE:
                cid += 1
            clusters.setdefault(cid, []).append(elem)
            previous = elem

        self.clusters['clusters'] = clusters
        self.clusters['clusters']['largest'] = self.get_largest_cluster()
        self.clusters['clusters']['number_of_clusters'] = cid + 1
Esempio n. 19
0
def getTemplateNoDiac(word):
	"""Return the possible template(s) (wazn) for an Arabic word.

	Diacritics are stripped from the word first.  Only templates of the
	same (stripped) length whose pattern regex-matches the word are
	considered; all templates tied at the minimal Levenshtein distance
	are returned joined with '+'.  Returns u"" when nothing matches.
	(Python 2: relies on the built-in unicode().)
	"""

	template = u""
	minDistance = 1000  # sentinel larger than any realistic distance
	word_u = deleteDiacritics(word)
	word_u = unicode(word_u)
	for wazn in wazns:
		wazn_u = deleteDiacritics(wazn)
		wazn_u = deleteRoot(wazn_u)
		wazn_u = unicode(wazn_u)
		if len(wazn_u) != len(word_u):
			continue

		#print "distance(" + word_u + "," + wazn_u + ")"
		distanceI = distance(word_u, wazn_u)
		if distanceI < minDistance:
			# Strictly better: restart the candidate list with this wazn.
			if re.match(wazn_u, word_u):
				minDistance = distanceI
				template = wazn
			continue
		if distanceI == minDistance:
			# Tie with the current best: append.
			if re.match(wazn_u, word_u):
				template = template + '+' + wazn
	return template
def optional_check():
    """Optionally check for sentences that we failed to match.

    Prints the unmatched sentences, then compares sentence hashes
    (Levenshtein distance < 15) against every description in the
    'train' split to surface near-misses.  (Python 2 syntax.)
    """
    print 'Not found:'
    not_found = {s for h,s in negation_sentence_hashes.items() if not h in found}
    print '\n'.join(not_found)

    from Levenshtein import distance

    print ''
    print 'Computing Levenshtein distances to find candidates we could have missed.'

    found_missing = False
    for split in ['train']:
        for key in old_file[split]:
            for description in old_file[split][key]['descriptions']:
                for sentence in not_found:
                    h1 = sentence_hash(sentence)
                    h2 = sentence_hash(description)
                    if distance(h1, h2) < 15:
                        found_missing = True
                        print 'POSSIBLE MATCH:'
                        print sentence
                        print description
                        print '----------------------------------'

    if not found_missing:
        print 'Matched all we could possibly match.'
Esempio n. 21
0
def main():
    """Entry point: runs try2() and returns.

    NOTE(review): everything after the early return below is
    unreachable legacy genetic-algorithm code, kept (with its
    commented-out variants) apparently for reference.  (Python 2.)
    """
    try2()
    return
    population = []
    population = givemepop(population)
    print population
    for i in range(0, CYCLES):

#    for i in range(0, POP_SIZE):
#        s = givemestring(len(TARGET))
#        d = distance(s, TARGET)
#        population.append((d, s))
#    population.sort()

        mated = crossover(population[0][1], population[1][1])
        d = distance(mated, TARGET)
        new_pop = []
        new_pop.append((d, mated))
        new_pop.append(population[0])
        new_pop.append(population[1])
#    for i in range(0, POP_SIZE - len(new_pop)):
#        s = givemestring(len(TARGET))
#        d = distance(s, TARGET)
#        new_pop.append((d, s))
#    new_pop.sort()
        new_pop = givemepop(new_pop)
        print new_pop
        population = new_pop
Esempio n. 22
0
def admin_season(selected_season):
    """Admin page for one season (Python 2, Flask-style view).

    POSTing action=recalculate_points recalculates/stores points and
    returns an empty JSON response.  Otherwise renders the admin page,
    listing every pair of driver names within Levenshtein distance 4
    as probable misspellings, sorted by distance.  404s on an unknown
    season.
    """
    # Get and validate season
    seasons = config.get_all_seasons()
    if not selected_season in seasons:
        abort(404, "Season data not found")
    season_data = config.get_season_data(selected_season)

    # Do we need to recalculate points?
    if 'action' in request.form and request.form['action'] == 'recalculate_points':
        season_data.calc_and_store_points()
        return jsonify()

    # Otherwise, just display the season page. Find similar drivers in real-time, before we do this
    results_table = season_data.get_results_for_class()
    d = sorted([row["driver"] for row in results_table.table])

    # O(n^2) pairwise comparison of driver names.
    similar_drivers = []
    for i in xrange(len(d)):
        for j in xrange(i + 1, len(d)):
            dist = distance(d[i], d[j])
            if dist <= 4:
                similar_drivers.append({"name1": d[i], "name2": d[j], "distance": dist})
    similar_drivers = sorted(similar_drivers, key=itemgetter("distance"))

    driver_name_corrections = config.get_driver_name_corrections(selected_season)

    return render_template("admin.html", seasons=seasons, selected_season=selected_season, season_data=season_data,
                           similar_drivers=similar_drivers, driver_name_corrections=driver_name_corrections)
Esempio n. 23
0
    def get_departures_by_station(self, station):
        """ Get list of Departures for one station

        Fetches the station search page, picks the search result whose
        name has the smallest Levenshtein distance to the query, then
        parses that station's departure board.  (Python 2 syntax.)
        """

        # TODO 1. Error handling
        # TODO 2. more error handling
        # TODO 3. ultimative error handling

        station = station.encode('UTF-8')
        html = urlopen(defaults.departures_by_station % quote_plus(station)).read()

        li = BeautifulSoup(html).ul.findAll('li')

        # li[0].a truthy => the page is a result list, not a board.
        if li[0].a:
            # calculate levenshtein distance of results
            st = map(lambda x: (distance(station, x.a.text.encode('UTF-8')), x.a.text.encode('UTF-8'), x.a['href']), li)
            # take result with lowest levenshtein distance
            s = min(st)
            lnk = s[2]
            
            if len(st) > 1:
                print "Multiple results found, using best match:", s[1]
            
            html = urlopen(defaults.qando + lnk).read()

        dep = self.parse_departures_by_station(html)

        return dep
Esempio n. 24
0
def calc_similarity(word1, word2):
    """Return a 0-100 similarity score for two strings.

    The Levenshtein distance is scaled by the longer string's length:
    identical strings score 100, completely different ones approach 0.

    Fix: two empty strings previously raised ZeroDivisionError; they
    are identical, so the score is defined as 100.
    """
    longest = max(len(word1), len(word2))
    if longest == 0:
        return 100
    return 100 - (100 / longest) * distance(word1, word2)
Esempio n. 25
0
File: 01-basic.py Progetto: lhl/misc
  def unshred(self, output):
    """Reassemble the shredded image strips and save to `output`.

    For every strip, find the strip whose left edge best matches its
    right edge (smallest distance).  The strip whose best match is the
    WORST overall is taken as the right-most strip; the order is then
    rebuilt right-to-left by repeatedly prepending the strip whose best
    right-neighbour is the current left-most one.  (Python 2 syntax.)
    """
    distances = {}   # strip key -> distance to its best right-neighbour
    totheright = {}  # strip key -> key of its best right-neighbour

    for key in self.strips:
      min_k = None
      min_d = None
      for key2 in self.strips:
        if key != key2:
          d = distance(self.strips[key]['right'], self.strips[key2]['left'])
          if min_k == None:
            min_k = key2
            min_d = d
          else:
            if d < min_d:
              min_k = key2
              min_d = d
      print '... strip %d closest match is %d (%d)' % (key, min_k, min_d) 
      distances[key] = min_d
      totheright[key] = min_k

    right_most = max(distances, key=distances.get)
    print 'We think that strip %d is the is the right-most strip!' % right_most

    del(totheright[right_most])
    pprint(totheright)
    self.ordered = [right_most]
    # Repeatedly prepend the strip that points at the current head.
    # NOTE(review): loops forever if the neighbour chain is broken.
    while totheright:
      for key in totheright.keys():
        if totheright[key] == self.ordered[0]:
          self.ordered.insert(0, key)
          del(totheright[key])
    print 'Here\'s our order:', self.ordered

    self._save(output)
def test_convert():
    """There are 4 amino acid changes, but because codons are chosen
    randomly, experimental tests show the nucleotide edit distance
    should land anywhere between 8 and 12 changes, inclusive."""
    d = distance(str(np.src_nt.seq), str(np.des_nt.seq))
    assert 8 <= d <= 12
Esempio n. 27
0
def matching(a, b):
    """True when a and b look like the same entity: one contains the
    other (case-insensitive), their Jaccard similarity is at least 0.3,
    or their Levenshtein distance is under 3."""
    a_low, b_low = a.lower(), b.lower()
    if a_low in b_low or b_low in a_low:  # substring either way
        return True
    return get_jaccard(a, b) >= .3 or distance(a, b) < 3
Esempio n. 28
0
 def getSuccessors(self, statement, i, goalCoHash = None):
     """Idempotence at node i: p & p -> p and p v p -> p.

     Returns the collapsed successor Statement with action/cost set, or
     None when node i is not an and/or node with identical children.
     `goalCoHash` is unused here."""
     if statement.type(i) not in ("conjunction", "disjunction"):
         return None
     operand = statement[i * 2 + 1]
     if operand != statement[i * 2 + 2]:
         return None
     successor = statement.graft(i, operand)
     successor.action = self.name
     successor.cost = self.cost + distance(str(statement), str(successor))
     return successor
Esempio n. 29
0
def givemepop(initialpop):
    """Top `initialpop` up to POP_SIZE random candidates.

    Each new candidate is a random string scored by its Levenshtein
    distance to TARGET, appended as a (distance, string) tuple.  The
    list is mutated in place, sorted by fitness, and also returned.
    """
    target_length = len(TARGET)
    for _ in range(len(initialpop), POP_SIZE):
        candidate = givemestring(target_length)
        initialpop.append((distance(candidate, TARGET), candidate))
    initialpop.sort()
    return initialpop
Esempio n. 30
0
def check_perms(names1, names2):
  """Try to match the two name lists by dropping one part of names1.

  For each part of names1, remove every occurrence of it and compare
  the joined remainder to the joined names2; True as soon as one such
  removal brings the Levenshtein distance within `thres`.
  """
  joined2 = ' '.join(names2)
  for omitted in names1:
    remaining = [part for part in names1 if part != omitted]
    if distance(' '.join(remaining), joined2) <= thres:
      return True
  return False
Esempio n. 31
0
def safe_distance(a, b):
    """Levenshtein distance with guards for abbreviation-style inputs.

    Dots are stripped first.  Identical strings cost 0.  A one-letter
    string matching the first letter of a long (>3 chars) counterpart
    counts as an initial and costs 0.  Empty or very short strings are
    penalised with max(len)+1 so they never look like good matches.
    """
    a, b = RX_DOT.sub('', a), RX_DOT.sub('', b)
    if a == b:
        return 0
    shorter, longer = sorted((a, b), key=len)
    if not shorter:
        return len(longer) + 1
    if len(shorter) == 1 and len(longer) > 3 and a[0] == b[0]:
        return 0
    if len(shorter) < 3:
        return max(len(a), len(b)) + 1
    return distance(a, b)
    def test_levenshtein(self):
        """Document the Levenshtein helpers (distance, ratio, setratio,
        seqratio) by example."""
        eq(distance('a', 'ab'), 1)  # number of additions, deletions, updates

        eq(ratio('a', 'b'), 0)  # in [0, 1]
        eq(ratio('a', 'a'), 1)

        eq(setratio(['a', 'b'], ['b', 'a']),
           1.0)  # in [0, 1] compares two sets by best fit, order doesnt matter
        eq(setratio(['c', 'd'], ['b', 'a']),
           0)  # in [0, 1] compares two sets by best fit, order doesnt matter

        eq(seqratio(['a', 'b'], ['b', 'a']), 0.5)  # in [0, 1]
        eq(seqratio(['a', 'b'], ['a', 'b']), 1.0)  # in [0, 1]
        eq(seqratio(['a'], ['a', 'b']), 2 / 3)
Esempio n. 33
0
def find_nearest(title, title_dict):
    """Look `title` up in title_dict, tolerating small misspellings.

    Exact hits return immediately.  Otherwise the key with the smallest
    length-normalised Levenshtein distance wins, provided that distance
    is below EDIT_DISTANCE_RATIO_THRESHOLD.  Returns the mapped value,
    or None (also when title itself is None).
    """
    if title is None:
        return None
    if title in title_dict:
        return title_dict[title]

    best_value = None
    best_ratio = EDIT_DISTANCE_RATIO_THRESHOLD
    title_len = len(title)
    for candidate, value in title_dict.items():
        ratio = distance(title, candidate) / max(title_len, len(candidate))
        if ratio < best_ratio:
            best_ratio = ratio
            best_value = value
    return best_value
Esempio n. 34
0
def compare_metadata(prev, potential, mood):
  """Field-by-field difference of two metadata records, weighted by
  mood['metadata'].

  prev/potential -- dicts whose 'metadata' entry is a list of string
  fields.  Index 5 is a year: compared numerically (absolute
  difference; values not exactly 4 chars count as 0).  Other fields use
  Levenshtein distance normalised by combined length; an empty field is
  penalised with 100000.  Returns the weighted per-field differences.
  (Python 2: relies on unicode(); map() there returns a list.)
  """
  linked = zip(prev['metadata'], potential['metadata'])
  diffs = []
  for idx, items in enumerate(linked):
    old, new = items
    if idx == 5:
      # dates
      olddate, newdate = 0,0
      if len(old) == 4:
        olddate = int(old)
      if len(new) == 4:
        newdate = int(new)
      diffs.append(np.abs(newdate-olddate))
    else:
      try:
        if old != "" and new != "":
          diffs.append(distance(unicode(old), unicode(new)) / float(len(old) + len(new) + 1) )
        else:
          diffs.append(100000)
      except Exception:
        # Fallback: plain byte-string comparison.  Fixes the original's
        # bare `except:` and its `distance((old), str(new))`, which
        # left `old` unconverted.
        diffs.append(distance(str(old), str(new)))
  weighted_diffs = map(lambda x: x[0]*x[1], zip(diffs, mood['metadata']))
  return weighted_diffs
Esempio n. 35
0
def test_khash(xs, D2, attempts=1e6):
    """Randomly sample pairs from xs and check that every pair at
    Levenshtein distance 1 or 2 appears as a key (sorted tuple) in D2.
    Prints a marker for each missing key; returns the number of
    successful checks.  (Python 2 syntax.)
    """
    n = len(xs)
    tests = 0
    for _ in range(int(attempts)):
        i, j = np.random.randint(n, size=2)
        a, b = xs[i], xs[j]
        d = distance(a, b)
        if 0 < d < 3:
            key = tuple(sorted((a,b)))
            if key not in D2:
                print 'fuckyou'
            else:
                tests += 1
    return tests
Esempio n. 36
0
def one_char_typosquatting(s_a='', s_b=''):
    """Detect one-character typosquatting between two strings of length
    at least 4.

    Covered variants:
        in-place change of one char:    paypal -> paypel / paypai / qaypal
        one extra char:                 paypal -> paypal2 / payypal / ppaypal
        one missing char:               paypal -> payal / papal
        two neighbouring chars swapped: paypal -> papyal / payapl

    Returns True when s_b looks like a one-character typo of s_a.
    """
    # Empty or identical inputs cannot be typosquats of each other.
    if not s_a or not s_b or s_a == s_b:
        return False

    # NOTE(review): with `and`, a short string paired with a long one
    # still passes this length gate -- confirm `or` was not intended.
    if len(s_a) < 4 and len(s_b) < 4:
        return False

    # Edit distance 1 covers the change / insert / delete variants.
    if distance(strip_accents(s_a), strip_accents(s_b)) == 1:
        return True

    # A neighbour swap keeps the length, so unequal lengths end it here.
    if len(s_a) != len(s_b):
        return False

    # Swap every adjacent pair in s_a and compare against s_b.
    for pos in range(len(s_a) - 1):
        if s_a[:pos] + s_a[pos + 1] + s_a[pos] + s_a[pos + 2:] == s_b:
            return True

    return False
Esempio n. 37
0
def match_maker(query, unknown):
    """Length-adjusted mismatch count between a query and an unknown.

    This little ditty gives us some wiggle room in identifying our
    indices and any other small targets: since `unknown` may be longer
    than `query`, the raw Levenshtein distance is reduced by the length
    difference so pure overhang is not counted as mismatch.

    :param query: target string (e.g. an index sequence)
    :param unknown: observed string, possibly longer than query
    :return: adjusted mismatch count
    """
    raw_mismatch = distance(query, unknown)
    overhang = len(unknown) - len(query)
    return raw_mismatch - overhang
Esempio n. 38
0
def eval_unnatural(stem, most_freq):
    """Penalise a stem whose pronunciation resembles a high-frequency word.

    Compares Double Metaphone encodings; a phonetic edit distance of
    exactly 1 to any frequent word is treated as "too similar to
    regularise naturally".

    Returns -5.0 when similar, 0 otherwise.
    """
    stem_code = dmeta(stem)[0]
    freq_codes = [dmeta(word)[0] for word in most_freq]
    # Distance of exactly 1 flags a near-homophone.
    if any(distance(stem_code, code) == 1 for code in freq_codes):
        # Phonetically close to a frequent word: unnatural to regularise.
        return -5.0
    # No near-homophone found.
    return 0
Esempio n. 39
0
 def last_chance(self, kw):
     """Best-effort dictionary lookup for *kw*.

     Tries increasing Levenshtein distances (0..3) between the cleaned,
     lowercased keyword and every dictionary entry, returning the index
     of the first entry found at the smallest distance, or 'unk' when
     nothing is within distance 3.
     """
     words = self.dictionnary
     # The normalised keyword never changes; compute it once instead of
     # once per loop iteration.
     cleaned = clean(basify(kw)).lower()
     for radius in range(4):
         hits = np.where(
             np.asarray([distance(cleaned, entry) for entry in words]) == radius
         )
         if len(hits[0]) > 0:
             return hits[0][0]
     # Bug fix: the original set final='unk' once i exceeded 3 but never
     # left the while-loop, so it kept scanning at ever-larger radii
     # (and spun forever on an empty dictionary). Give up explicitly.
     return 'unk'
Esempio n. 40
0
    def suggest_v1(self, word):
        """Return up to 10 spelling suggestions for *word*.

        Candidates come from one- or two-edit variants, are filtered to
        words present in any of the tries, and ranked by Levenshtein
        distance to the original word.
        """
        def intrie(candidate):
            # Membership in any of the three tries qualifies the candidate.
            if (candidate in self.secondary_trie or candidate in self.trie
                    or candidate in self.book_trie):
                return True
            else:
                return False

        # NOTE(review): edits2 is only consulted when edits1 yields nothing
        # (falsy) — presumably intentional; confirm against callers.
        candidates = list(self.edits1(word) or self.edits2(word))
        in_dictionary = list(filter(intrie, candidates))
        suggestions = sorted(in_dictionary, key=lambda x: distance(x, word))
        # Bug fix: `n` was computed but never applied — the full list was
        # returned. Cap the result at the 10 best suggestions as intended.
        n = min(10, len(suggestions))
        return suggestions[:n]
def single_barcode_adj(b, BClist):
    """Match barcode *b* against a reference list, allowing one mismatch.

    Returns a [barcode, status] pair:
      * [b, "match"]   -- exact hit in BClist
      * [ref, "adj"]   -- unambiguous reference within Levenshtein distance 1
      * ["NA", "rm"]   -- ambiguous or too distant; caller should drop it
    """
    if b in BClist:
        return [b, "match"]

    dis_to_refer = [distance(b, referBC) for referBC in BClist]
    sorted_dis = sorted(dis_to_refer)

    # Bug fix: with fewer than two references the original indexed
    # sorted_dis[1] and raised IndexError. A single close reference is
    # accepted directly; otherwise there is nothing to assign.
    if len(sorted_dis) == 1:
        if sorted_dis[0] <= 1:
            return [BClist[0], "adj"]
        return ["NA", "rm"]

    # Accept only when the best hit is close AND strictly better than the
    # runner-up (unambiguous assignment).
    if sorted_dis[0] <= 1 and sorted_dis[1] - sorted_dis[0] >= 1:
        return [BClist[dis_to_refer.index(sorted_dis[0])], "adj"]
    return ["NA", "rm"]
Esempio n. 42
0
def filterLevenshtein(msg, filterWords, englishWords, levenshteinDistance):
    """Replace *msg* with asterisks when it fuzzily matches a filter word.

    A message is filtered when, after stripping non-alphanumerics, it is
    within *levenshteinDistance* edits of a filter word and is not itself
    a valid English word. Otherwise the original message is returned.
    """
    # Strip special characters before comparing.
    filteredMsg = ''.join(e for e in msg if e.isalnum())
    for word in filterWords:
        # Cheap length pre-check before the more expensive edit distance.
        # Bug fix: abs() previously wrapped the entire comparison (a
        # boolean, abs -> 0 or 1), so the guard was effectively always
        # true; the parenthesis now closes around the length difference.
        if abs(len(word) - len(filteredMsg)) <= levenshteinDistance:
            if distance(word, filteredMsg) <= levenshteinDistance:
                # Only censor words that are not real English words.
                if not WordChecker.check_word_exists_in(
                        englishWords, filteredMsg):
                    return generateRandomAsteriskString()
    # No filter matched: pass the message through untouched.
    return msg
Esempio n. 43
0
def _get_closest_string(string,
                        iterable,
                        length_dependant: bool = True,
                        preprocess=lambda s: s.lower()):
    """Return the element of *iterable* closest to *string*.

    Distance is Levenshtein distance after *preprocess*; when
    *length_dependant* is True the distance is normalised by the
    candidate's length so long strings are not unfairly penalised.
    Falls back to the (preprocessed) input when *iterable* is empty.
    """
    string = preprocess(string)
    # Idiom fix: identity comparison against None, not `!= None`.
    candidates = [item for item in iterable if item is not None]
    scored = sorted({
        # max(..., 0.01) avoids division by zero for empty candidates.
        s: distance(string, preprocess(s)) /
        (max(len(preprocess(s)), 0.01) if length_dependant else 1)
        for s in candidates
    }.items(),
                   key=lambda pair: pair[1])
    if len(scored) > 0:
        return scored[0][0]
    return string
Esempio n. 44
0
 def score(
         self, query_meta: Dict, old_match_scores: Dict, match_meta: Dict
 ) -> "np.ndarray":
     """Re-score matches as negative Levenshtein distance to the query text.

     Closer matches (smaller edit distance) receive higher (less negative)
     scores. Returns (match_id, score) rows as a float64 array.
     """
     from Levenshtein import distance

     query_text = query_meta['text']
     new_scores = []
     for match_id in old_match_scores:
         gap = distance(query_text, match_meta[match_id]['text'])
         new_scores.append((match_id, -gap))
     return np.array(new_scores, dtype=np.float64)
Esempio n. 45
0
def weather():      #天气现象查询
    print('\n本脚本只提供查询编码功能')
    while 1:
        dic={'露':'01','霜':'02','结冰':'03','大风':'15','积雪':'16','雾凇':'48','雨凇':56,'冰雹':89,'霾':'05','浮尘':'06','扬沙':'07','轻雾':10,'沙尘暴':31,'雾':42,'毛毛雨':50,'雨':60,'雨夹雪':68,'雪':70,'阵雨':80,'阵性雨夹雪':83,'阵雪':85}
        key=input('\n请输入要查询的天气现象,或输入-1退出:')
        if key=='-1':break
        try:
            print('编码为:',dic[key])
        except KeyError as e:
            print('未找到相关关键词:',e,'\n\n你可能是想写:')
            for i in dic.keys():
                f=distance(key.encode('unicode_escape'),i.encode('unicode_escape'))
                if f<5:
                    print(i,end='  ')
Esempio n. 46
0
def levenshtein(string, candidates):
    """
    Pick the candidate whose length-expanded value best matches *string*.

    Each candidate value is tiled (repeated) out to the length of *string*
    and truncated, then compared by Levenshtein distance; the key with the
    lowest total distance wins (via get_lowest).
    """
    scores = defaultdict(int)
    target_len = len(string)

    for name, pattern in candidates.items():
        # Repeat the pattern to cover the target, then trim the overhang.
        tiled = (pattern * (target_len // len(pattern) + 1))[:target_len]
        scores[name] += distance(string, tiled)

    return get_lowest(scores)
Esempio n. 47
0
def get_reduced_distances(chunk, edit_distance):
    """Filter a chunk of tags so every kept tag is >= edit_distance apart.

    ``chunk`` is ``(base_tag, comparisons)`` where each comparison carries
    a candidate tag at index 1. Candidates are first re-checked against
    the base tag, then greedily accepted only when they are at least
    ``edit_distance`` edits from every tag already kept. The kept set is
    pickled to a temp file (see ``pickler``) whose handle is returned.

    This was a struggle to do simply and without consuming lots of RAM
    (e.g. numpy arrays), but the greedy solution is rather simple.
    """
    # Regenerate and filter the pairwise comparisons: keep only candidates
    # far enough from the base tag.
    good_comparisons = [c for c in chunk[1]
                        if distance(chunk[0], c[1]) >= edit_distance]
    # The base tag itself is always kept (it anchors the comparisons).
    keepers = [chunk[0]]
    # Greedy pass: accept a candidate only if it clears the minimum
    # distance against everything accepted so far.
    # (Cleanup: removed unused locals all_keepers / temp_dist.)
    for tag in good_comparisons:
        skip = False
        for keep in keepers:
            if distance(keep, tag[1]) < edit_distance:
                skip = True
                # No need to continue once we're already < edit_distance.
                break
        if not skip:
            keepers.append(tag[1])
    # pickler writes keepers to disk; see its docstring.
    tf = pickler(keepers)
    return tf
Esempio n. 48
0
def iterate_insde_dict(collected_words_list, handled_ids, word, lemma, pos,examined_word_len, letter_count_dict, search_range, debug = DEBUG):
    """Collect words of a given length whose lemma is close to *lemma*.

    Scans ``letter_count_dict[str(examined_word_len)]`` in random order and
    appends matches (lemma Levenshtein distance in (0, search_range], id not
    yet handled, and — when POS info is available — POS distance within
    range) to *collected_words_list*, recording ids in *handled_ids*.
    Stops after ~10 hits or once the shared list exceeds 12 entries.

    Returns the number of words collected by this call.
    """
    collected_words = 0
    if str(examined_word_len) in letter_count_dict and examined_word_len > 2:
        compare_words = letter_count_dict[str(examined_word_len)]
        random.shuffle(compare_words)
        for word_compare_el in compare_words:
            word_compare = word_compare_el[0]
            # "unknown" POS tags are treated as missing.  # SPECIFIC
            if "unknown" not in word_compare_el[1]:
                pos_compare = word_compare_el[1]
            else:
                pos_compare = None
            lemma_compare = word_compare_el[2]
            comp_word_id = word_compare_el[3]
            comp_ref_id = word_compare_el[4]
            comp_set_id = word_compare_el[5]
            # Tight searches keep the POS radius at 1.
            pos_search_range = search_range if search_range > 3 else 1

            # Perf fix: the lemma distance was previously recomputed up to
            # three times per candidate; compute it once.
            lemma_dist = distance(lemma_compare, lemma)
            in_range = 0 < lemma_dist <= search_range
            fresh = comp_word_id not in handled_ids

            if pos and pos_compare:
                if in_range and distance(pos_compare, pos) <= pos_search_range and fresh:
                    if debug:print("FOUND VS POS", word,word_compare,lemma, lemma_compare, pos,pos_compare)
                    collected_words_list.append({"word_id":comp_word_id, "ref_id":comp_ref_id,"setting_id":comp_set_id, "ngramm": word_compare})
                    handled_ids.append(comp_word_id)
                    collected_words += 1
                    if (collected_words > 10 or len(collected_words_list) > 12):break
            else:
                if in_range and fresh:
                    if debug:print("FOUND NON POS", word,word_compare,lemma, lemma_compare)
                    collected_words_list.append({"word_id":comp_word_id, "ref_id":comp_ref_id,"setting_id":comp_set_id, "ngramm": word_compare})
                    handled_ids.append(comp_word_id)
                    collected_words += 1
                    if (collected_words > 10 or len(collected_words_list) > 12):break
    return collected_words
Esempio n. 49
0
def buildListDict(input_file, distance_stringency, pickleOut):
    """Collapse FASTQ reads by UMI, merging UMIs within a distance threshold.

    Returns ``{umi: ([seqs], [first_header], [first_quality])}``. The UMI is
    assembled from the flanking bases of each read (see slicing below); two
    UMIs within *distance_stringency* edits are treated as one molecule.

    NOTE(review): *pickleOut* is accepted but never used in this function —
    kept for interface compatibility; confirm against callers.
    NOTE(review): unique UMIs after the first are never added to umi_list,
    so later reads are only compared against the first UMI — confirm intent.
    """
    # Dict format: {'UMI': ([seqs], [first_header], [first_quality])}
    sequences = defaultdict(lambda: ([], [], []))
    umi_list = []
    position = 1          # 1-based cursor within the 4-line FASTQ record
    is_unique = True

    # Fix: the input file is now closed via a context manager even when an
    # exception interrupts the parse (previously a plain open/close pair).
    with open(input_file, 'r') as target:
        for line in target:
            if position == 1:
                header = line.rstrip('\n')
                position += 1
            elif position == 2:
                # Assumes UMI flanks the read; absolute offsets from
                # start/end compatible with miSeq/hiSeq read lengths.
                umi_seq = line[0:11] + line.rstrip('\n')[-11:]
                umi_seq = umi_seq.rstrip('\n')
                read_seq = line.rstrip('\n')[6:-6]
                position += 1
            elif position == 3:
                # '+' separator line: nothing to record.
                position += 1
            elif position == 4:
                quality = line.rstrip('\n')
                position = 1

                if not umi_list:
                    umi_list.append(umi_seq)
                else:
                    is_unique = True
                    for umi in umi_list:
                        # First close-enough UMI wins; collapse onto it.
                        if is_unique and distance(umi_seq, umi) <= distance_stringency:
                            is_unique = False
                            umi_seq = umi

                # It is important for duplex collapsing to make sure reads
                # are of the same length; when not duplex collapsing this
                # should always be true.
                if not is_unique and len(sequences[umi_seq][0][0]) == len(read_seq):
                    sequences[umi_seq][0].append(read_seq)
                elif is_unique:
                    sequences[umi_seq][0].append(read_seq)

                # Record header/quality only once per UMI (prevents
                # multiple-entries error).
                if is_unique and not sequences[umi_seq][1]:
                    sequences[umi_seq][1].append(header)
                    sequences[umi_seq][2].append(quality)

    return sequences
Esempio n. 50
0
def attach_UMI_barcode(Read1, Read2, barcodes, mismatch_rate=1):
    """Demultiplex paired reads by barcode, prefixing output with barcode/UMI.

    Read1 carries barcode (bases 0-6) + UMI (bases 6-12); Read2 carries the
    insert. Records whose barcode is within *mismatch_rate* edits of a known
    barcode are appended to ``<barcode>.align.fastq`` with the barcode and
    UMI stitched into the header and '+' lines; non-matching records are
    skipped.
    """
    mismatch_rate = int(mismatch_rate)
    # Fix: all file handles are now closed deterministically. Previously
    # each matching record leaked an output handle (only the last was
    # closed), and f3.close() raised NameError when no read ever matched.
    with open(Read1) as f1, open(Read2) as f2:
        line1 = f1.readline()            # header of first R1 record
        line2 = f2.readline()            # header of first R2 record

        while line1:
            line1 = f1.readline()        # R1 sequence line
            target = line1[0:6]
            mismatch = [
                distance(target, barcodes[idx]) for idx in range(len(barcodes))
            ]

            if min(mismatch) <= mismatch_rate:
                barcode = barcodes[mismatch.index(min(mismatch))]
                UMI = line1[6:12]
                with open(barcode + ".align.fastq", "a") as f3:
                    f3.write("@" + barcode + "," + UMI + "," + line2[1:])
                    f3.write(f2.readline())                       # R2 sequence
                    third_line = f2.readline()
                    f3.write("+" + barcode + "," + UMI + "," + third_line[1:])
                    f3.write(f2.readline())                       # R2 quality
                line2 = f2.readline()    # next R2 header
            else:
                # No barcode match: skip the remaining three R2 lines plus
                # the next header.
                line2 = f2.readline()
                line2 = f2.readline()
                line2 = f2.readline()
                line2 = f2.readline()

            line1 = f1.readline()        # R1 '+' line
            line1 = f1.readline()        # R1 quality line
            line1 = f1.readline()        # next R1 header
Esempio n. 51
0
def calc_distance_matrix(data):
    """Calculate a distance matrix between languages.

    1. Computes the Levenshtein distance for every concept shared between
       two languages (smallest distance across alternative translations).
    2. Normalizes each distance by the largest per-pair word distance.
    3. Averages the normalized distances into one language distance.

    Returns: the distance matrix (pandas DataFrame, language labels on
    both axes).
    """
    d_matrix = np.zeros(shape=(len(data.keys()), len(data.keys())))
    i = 0

    for lang1, lang1_dict in data.items():
        j = 0
        for lang2, lang2_dict in data.items():
            lv_distances = []

            for w_concept1, word1_l in lang1_dict.items():
                word2_l = lang2_dict.get(w_concept1, None)

                # Concept missing in the other language: skip it.
                if word2_l is None:
                    continue

                # A concept may have several translations; keep the
                # smallest cross-product distance.
                dis_list = [distance(w1, w2)
                            for w1 in word1_l for w2 in word2_l]
                lv_distances.append(np.min(dis_list))

            # Guard: with no shared concepts the original crashed on
            # np.max([]) and on the division below. NaN marks "no data".
            if not lv_distances:
                d_matrix[i][j] = np.nan
                j += 1
                continue

            longest_wd = np.max(lv_distances)
            if longest_wd > 0:
                lv_distances = [x / longest_wd for x in lv_distances]

            d_matrix[i][j] = np.sum(lv_distances) / len(lv_distances)
            j += 1
        i += 1

    d = DataFrame(d_matrix)
    d.index = data.keys()
    d.columns = data.keys()
    return d
Esempio n. 52
0
    def distance(self, other):
        """Per-part normalised Levenshtein distances between two items.

        Compares corresponding item parts (the leading Id excluded) and
        returns a list of distances scaled by the longer part's length;
        pairs where either part is empty are skipped.
        """
        # Drop the leading Id from both part lists.
        parts_self = self.get_item_parts()[1:]
        parts_other = other.get_item_parts()[1:]

        dists = []
        for idx in range(len(parts_self)):
            # Treat empty strings as missing values.
            left = parts_self[idx] if parts_self[idx] != "" else None
            right = parts_other[idx] if parts_other[idx] != "" else None
            if (left is not None) and (right is not None):
                dists.append(float(distance(left, right)) / max(len(left), len(right)))

        return dists
Esempio n. 53
0
    def insert(self, word):
        """Insert *word* into the distance-keyed spell tree.

        Returns 'none' when this node was empty, 'not_inserted' for a
        duplicate, otherwise the word of the parent node the new entry
        hangs off.
        """
        # Empty node: claim the word.
        if self.word is None:
            self.word = word
            return 'none'
        # Exact duplicate: nothing to insert.
        if self.word == word:
            return 'not_inserted'
        dist = distance(word, self.word)
        # Descend into the child at the same edit distance, if one exists.
        for child_node, child_dist in self.children:
            if child_dist == dist:
                return child_node.insert(word)
        # No child at this distance: attach a new leaf here.
        self.children.append((SpellTree(word), dist))
        return self.word
Esempio n. 54
0
def find_music(app, root):
    """Open the file under *root* whose name is closest to *app*.

    Walks the directory tree, scores every filename by Levenshtein
    distance to *app*, and launches the best match via ``open`` (macOS).
    """
    tolaunch = ""
    minimum = 999999
    # Cleanup: removed an unused duplicate os.walk() call and the unused
    # `path` alias.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            filePath = os.path.join(dirpath, filename)
            r = int(distance(filename, app))
            if r < minimum:
                tolaunch = filePath
                minimum = r
                print("actual best : "+str(tolaunch)+ " distance : "+str(minimum))
    # NOTE(review): when the tree is empty this runs `open ""` — confirm
    # whether a guard is wanted.
    os.system("open \""+tolaunch+"\"")
 def get_syn_sim(self, node1, node2):
     """Return the syntactic similarity of two graph nodes.

     Similarity is 1 - Levenshtein(name1, name2) / max(len); two empty
     node names count as identical (similarity 1).

     :param node1: (id, attrs) tuple carrying 'node_name'
     :param node2: (id, attrs) tuple carrying 'node_name'
     :return: similarity in [0, 1]
     """
     name_a = node1[1]['node_name']
     name_b = node2[1]['node_name']
     edit_gap = distance(name_a, name_b)
     if len(name_a) == 0 and len(name_b) == 0:
         return 1
     # Normalise by the longer name so the score stays in [0, 1].
     return 1 - edit_gap / max(len(name_a), len(name_b))
Esempio n. 56
0
def validate_password_dictionary(value):
    """Reject passwords too similar to a configured set of dictionary words.

    Reads PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD and
    PASSWORD_DICTIONARY from settings; when both are set, raises
    ValidationError for any password within the threshold edit distance
    of a dictionary word. No-op when either setting is absent/falsy.
    """
    threshold = getattr(
        settings, "PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD", None)
    dictionary = getattr(settings, "PASSWORD_DICTIONARY", None)

    # Both settings must be configured for this validator to apply.
    if not (threshold and dictionary):
        return

    for word in dictionary:
        if distance(text_type(value), text_type(word)) <= threshold:
            raise ValidationError(
                _("Too similar to a restricted dictionary word."),
                code="dictionary_word")
Esempio n. 57
0
def compute_speaker_Levenshtein_distance(speaker_name):
	"""Return the two known speakers closest to *speaker_name*.

	Scores every full name (and every index entry, mapped back onto its
	full name) from APnames.xlsx by Levenshtein distance, keeping the
	smaller score per full name, and returns the best two
	(name, distance) pairs.
	"""
	full_speaker_names = read_names("APnames.xlsx")

	distance_size = {}
	# Pass 1: distance against each full name directly.
	# (Cleanup: dropped the unused enumerate index and commented-out code.)
	for speaker in full_speaker_names['Full Name']:
		distance_size[speaker] = distance(speaker, speaker_name)

	# Pass 2: distance against each index entry, folded back onto the
	# corresponding full name — keep the minimum of the two passes.
	for j, speaker in enumerate(full_speaker_names.index.values):
		dist = distance(speaker, speaker_name)
		full_name = full_speaker_names["Full Name"].iloc[j]
		if full_name in distance_size:
			if dist < distance_size[full_name]:
				distance_size[full_name] = dist
		else:
			distance_size[full_name] = dist

	dist_size_sorted = sorted(distance_size.items(), key = lambda kv: kv[1])
	return dist_size_sorted[:2]
Esempio n. 58
0
 def closest_hexameter_patterns(self, scansion: str) -> list:
     """Find the closest group of matching valid hexameter patterns.

     Only candidates with a matching length (number of syllables) are
     considered; every pattern tied for the minimum edit distance is
     returned with the scansion's original ending restored.
     """
     # Normalise: drop spaces, then foot separators.
     pattern = scansion.replace(" ", "")
     pattern = pattern.replace(self.constants.FOOT_SEPARATOR, "")
     ending = pattern[-1]
     # Swap the final syllable for the 'optional ending' wildcard.
     candidate = pattern[:len(pattern) - 1] + self.constants.OPTIONAL_ENDING
     scored = [(distance(candidate, known), known)
               for known in self.VALID_HEXAMETERS
               if len(known) == len(candidate)]
     if not scored:
         return []
     best = min(score for score, _ in scored)
     # Keep everything tied for best, restoring the original ending.
     return [known[:-1] + ending for score, known in scored if score == best]
Esempio n. 59
0
 def search(self, word):
     """Look up *word*, returning candidate corrections with scores.

     Returns a dict mapping dictionary entries to a distance score
     (0 for an exact case-insensitive hit; otherwise edit distance as a
     percentage of the word length).
     """
     original = word
     word = word.lower()
     candidates = {}
     if (self.inList(word)):
         # Exact (lowercased) hit: perfect score.
         candidates[self.get(word)] = 0
     else:
         edits = self.edits(word)
         for word in edits.values():  # NOTE: rebinds `word` per edit variant
             # Only consider variants that share the original's first letter.
             if (self.inList(word) and word[0] == original[0].lower()):
                 # NOTE(review): both `word` and `original` are rebound here,
                 # so later iterations compare against the lowered values —
                 # looks fragile; confirm this ordering is intended.
                 word, original = self.get(word).lower(), original.lower()
                 d = distance(word, original)
                 l = len(original)
                 # Score: edit distance as a percentage of the word length.
                 candidates[self.get(word)] = d / l * 100
     return candidates
Esempio n. 60
0
def get_LD(i, j):
    '''
    Calculate sequence distance between a pair of Seq objects.

    Equal-length junctions are scored with a gapless global alignment
    (the -50/-50 gap penalties effectively forbid gaps), and distance is
    length minus matches; unequal lengths fall back to plain Levenshtein
    distance.
    '''
    # Different lengths: Levenshtein handles insertions/deletions.
    if i.junc_len != j.junc_len:
        return distance(i.junc, j.junc)

    # Same length: pairwise2 is used to force a 'gapless' comparison.
    identity = pairwise2.align.globalms(i.junc, j.junc, 1, 0, -50, -50,
                                        score_only=True,
                                        one_alignment_only=True)
    # pairwise2 may return a non-float sentinel when nothing aligns.
    if type(identity) != float:
        identity = 0.0
    return i.junc_len - identity