def is_chain(venue_id):

    vs = VenueSearcher()

    venue_data = vs.get_venue_json(venue_id)

    if vs.venue_has_chain_property(venue_data):
        return True

    global_venues = vs.global_search(venue_data['name'])

    global_similar_name_count = len(filter(lambda x: ratio(x, venue_data['name']) > 0.95, [venue['name'] for venue in global_venues]))

    if len(global_venues) > 1 and global_similar_name_count > 0:
        # proportion of results whose name closely matches this venue's name
        global_proportion = global_similar_name_count / float(len(global_venues))
    else:
        global_proportion = 0

    local_venues = vs.local_search(venue_data, venue_data['name'], 5000)
    local_similar_name_count = len(filter(lambda x: ratio(x, venue_data['name']) > 0.95, [venue['name'] for venue in local_venues]))

    if len(local_venues) > 1 and local_similar_name_count > 0:
        local_proportion = local_similar_name_count / float(len(local_venues))
    else:
        local_proportion = 0

    return global_proportion > 0.9 or local_proportion > 0.9
def infer_function(question, new, old):
    # Default map translations that need no manual confirmation
    # (this should go to a separate file later).
    default_map = {
        'Repeated collection(more than once)': 'Repeated collection (specify frequency and/or time interval) ',
        'Subgroup analyzed (eg. Dementia)': 'Subgroup analyzed (eg. Dementia, please specify subgroup)'
    }

    try:
        if ratio(unicode(default_map[old]), unicode(new)) > 0.97:
            return True

        return False

    except KeyError:
        print "Not default mapping, manual input required"

    answer = None

    # Ignore low scores automatically
    if ratio(old, new) < float(options['ignore']):
        return False

    while answer not in ('y', 'n'):
        print """The number of new choices missing processing for question %s is 1; there could be a non-obvious match.

        Is '%s' a change of '%s'? (y/n)
        """ % (question, new, old)
        answer = raw_input()

    return answer == 'y'
Example #3
    def cleanCommodities(self, data):
        for i in xrange(len(data)):
            if data[i][0] is not None:
                mindist = 100
                topcomm = ""
                alternatives = []
                for comm in self.comm_list:
                    dist = distance(data[i][0].value, unicode(comm))
                    if dist < 7:
                        alternatives.append((unicode(comm), dist))
                    if dist < mindist:
                        mindist = dist
                        topcomm = comm
                    if dist == 0:
                        data[i][0].value = topcomm
                        data[i][0].confidence = 1.0
                        break
                alternatives.sort(key=lambda x: x[1])
                optional_values = [j[0] for j in alternatives]
                
                maxdist = 4
                if len(data[i][0].value) < 5:
                    maxdist = 3

                if mindist < maxdist:
                    data[i][0].value = topcomm
                    if mindist < 2:
                        data[i][0].confidence = 1.0
                    else:
                        data[i][0].confidence = 0.7
                    if mindist != 0:
                        data[i][0].optional_values = [data[i][0].value] + optional_values
                else:
                    data[i][0].confidence = 0.0
                    data[i][0].optional_values = [data[i][0].value] + optional_values
            # LOW MED HIGH
            if data[i][4] is not None:
                topratio = 0.0
                toplev = ""
                for lev in self.levels[self.lang]:
                    rat = ratio(data[i][4].value, unicode(lev))
                    if rat > topratio:
                        topratio = rat
                        toplev = lev
                data[i][4].value = toplev
            if data[i][6] is not None:
                topratio = 0.0
                toplev = ""
                for lev in self.levels[self.lang]:
                    rat = ratio(data[i][6].value, unicode(lev))
                    if rat > topratio:
                        topratio = rat
                        toplev = lev
                data[i][6].value = toplev
Example #4
File: match.py Project: albins/music-tools
def levenshtein_ok(fl_title, fl_artist, ls_artist, ls_title):

    title_closeness = ratio(fl_title, ls_title)

    # TODO We should really examine these fields and see which are most unique.
    # We should also examine various threshold values.
    if title_closeness > 0.80:
        artist_closeness = ratio(fl_artist, ls_artist)
        return artist_closeness > 0.80
    return False
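
A quick sanity check of the thresholds (the tag values below are invented for illustration):

print(levenshtein_ok('Hey Jude', 'The Beatles', 'The Beatles', 'Hey Jude'))
# True: both title and artist ratios are 1.0, above the 0.80 cutoff
print(levenshtein_ok('Hey Jude', 'The Beatles', 'Rolling Stones', 'Hey Jude'))
# False: the artist ratio falls well below 0.80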
Example #5
	def match_platform(self, test_platform):
		test_platform = test_platform.lower()
		hi_score = 0
		best_match_platform = {}
		for current_platform in platform.full_list:
			Lratio = ratio(test_platform, current_platform['name'].lower())
			if ratio(test_platform, current_platform['shortcode'].lower()) > Lratio: Lratio = ratio(test_platform, current_platform['shortcode'].lower()) 
			if ratio(test_platform, current_platform['alias'].lower()) > Lratio: Lratio = ratio(test_platform, current_platform['alias'].lower())
			if Lratio > hi_score: 
				hi_score = Lratio
				best_match_platform = current_platform
		return best_match_platform
def checkForRumorWords(uLang, tLang, text):
    # keywords is a module-level list (e.g. loaded via readKeyWords("keyword2"))
    if uLang != "es" and tLang != "es":
        return False
    flag = False
    text = removePunctuations(text)
    text = unidecode(text)
    words = text.split(" ")
    for word1 in keywords:
        if len(word1) > 2:

            for word2 in words:
                if ratio(word1.lower(), word2.lower()) >= 0.9:
                    flag = True
                    break
            if flag:
                break

            # bigrams over words, not characters
            for pairs in nltk.bigrams(words):
                pairWords = pairs[0] + ' ' + pairs[1]
                if ratio(word1.lower(), pairWords.lower()) >= 0.8:
                    flag = True
                    break
            if flag:
                break

            for tris in nltk.trigrams(words):
                trisWords = tris[0] + ' ' + tris[1] + ' ' + tris[2]
                if ratio(word1.lower(), trisWords.lower()) >= 0.8:
                    flag = True
                    break
            if flag:
                break

    return flag
Example #7
def computeMeanAveragePrecision(robot, layer, media, shots, qRelevant):

    # load submission
    qReturned = []
    for medium in media:
        for annotation in robot.getAnnotations(layer=layer, medium=medium):
            shot = annotation.fragment
            if shot not in shots:
                continue
            personName = annotation.data.person_name
            confidence = annotation.data.confidence
            qReturned.append((shot, personName, confidence))

    # sort submitted shots in decreasing confidence order
    qReturned = sorted(qReturned, key=lambda s: s[2], reverse=True)

    # per query average precision
    qAveragePrecision = {}
    for query, relevant in qRelevant.iteritems():
        # keep shots whose person name closely matches the query (Levenshtein ratio >= 0.95)
        returned = [s for s, p, _ in qReturned if ratio(query, p) >= 0.95]
        # average precision for this query
        qAveragePrecision[query] = computeAveragePrecision(returned, relevant)

    # mean average precision
    mAP = np.mean(qAveragePrecision.values())
    return mAP
Example #8
def calculate_similarity(str1, str2):
    """
    # Calculate document TF-IDF
    d_tfidf = dict()
    token_counts = self.doci[ID]
    max_count = max(token_counts.itervalues())
    for term in token_counts:
        # TF: Raw frequency divided by the maximum raw frequency
        # of any term in the document.
        tf = token_counts[term] / max_count
        # IDF: Total number of documents in the corpus divided by
        # the number of documents where the term appears.
        idf = math.log(len(self.doci) / self.dict.dfs[term])
        d_tfidf[term] = tf * idf
    # Calculate inner product
    inner_product = 0
    for term in terms:
        if term in token_counts:
            inner_product += q_tfidf[term] * d_tfidf[term]
    # Calculate query length
    query_length = 0
    for term in q_tfidf:
        query_length += q_tfidf[term] ** 2
    query_length = math.sqrt(query_length)
    # Calculate document length
    doc_length = 0
    for term in d_tfidf:
        doc_length += d_tfidf[term] ** 2
    doc_length = math.sqrt(doc_length)
    # Calculate the cosine similarity
    cosine_sim = inner_product / (query_length * doc_length)
    ranked_pages[ID] = cosine_sim
    """
    return ratio(str1, str2)
Example #9
def find_suitable_el(name, collection):
    """
    Finds and returns most string from collection using Levenshtein ratio algorithm
    """
    best_score, el = max((ratio(name, el), el) for el in collection)
    if best_score >= 0.65:
        return el
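
A minimal usage sketch (the candidate collection is invented for illustration):

candidates = ["apple", "apricot", "banana"]
print(find_suitable_el("aple", candidates))  # "apple" (ratio 8/9, above 0.65)
print(find_suitable_el("xyz", candidates))   # None: no candidate reaches 0.65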
Example #10
def find_similar_words(db, config, request):
    """Edit distance function."""
    # Check if lookup is cached
    hashed_query = hashlib.sha256()
    hashed_query.update(request['q'].encode("utf8"))
    hashed_query.update(str(request.approximate_ratio).encode('utf8'))
    approximate_filename = os.path.join(config.db_path, "data/hitlists/%s.approximate_terms" % hashed_query.hexdigest())
    if os.path.isfile(approximate_filename):
        with open(approximate_filename, encoding="utf8") as fh:
            approximate_terms = fh.read().strip()
            return approximate_terms
    query_groups = get_all_words(db, request)
    file_path = os.path.join(config.db_path, "data/frequencies/normalized_word_frequencies")
    new_query_groups = [set([]) for i in query_groups]
    with open(file_path, encoding="utf8") as fh:
        for line in fh:
            line = line.strip()
            try:
                normalized_word, regular_word = line.split('\t')
                for pos, query_group in enumerate(query_groups):
                    for query_word in query_group:
                        if ratio(query_word, normalized_word) >= float(request.approximate_ratio):
                            new_query_groups[pos].add(regular_word)
            except ValueError:
                pass
    new_query_groups = ' '.join([" | ".join(group) for group in new_query_groups])
    with open(approximate_filename, "w", encoding="utf8") as cached_file:
        cached_file.write(new_query_groups)
    return new_query_groups
Example #11
def number_match(fileparse):
    synonyms = set({u'number', u'integer', u'figure', u'digit', u'character', u'symbol',
    u'cardinal', u'ordinal', u'amount', u'quantity', u'total', u'aggregate', u'tally', u'quota',
    u'limit'})
    pattern = r'[\d\s]+'
    
    for cid in {k: v for k,v in fileparse.nps.items() if not v.get('ref')}:
        check_for_number = False
        for syn in synonyms:
            if ratio(fileparse.nps[cid]['text'].lower(), syn) > .9:
                check_for_number = True
        if not check_for_number:
            continue
 
        numbers = []
        for parse in fileparse.parses:
            numbers.extend(findall(pattern, parse.text))
        longest = ''
        if numbers:
            for num in numbers:
                if len(num) > len(longest):
                    longest = num
        if longest:
            aid = _get_cid(fileparse.nps, longest, cid)
            if not aid:
                aid = _mk_coref_id()
                data = {'text': longest, 'ref': None}
                fileparse.nps[aid] = data
            fileparse.nps[cid]['ref'] = aid
 def _build_match_list(self, query):
     # Build a list of tuples containing the similarity ratio ([0, 1])
     ranked = [(ratio(txd, query), d_id, txd, d)
               for (d_id, txd, d) in self._data]
     # Sort the results by ratio (descending)
     ranked.sort(key=itemgetter(0), reverse=True)
     return ranked
Example #13
def process_mean_distance_labels(parent, current, differ):
    changed = differ.changed()
    if not changed:
        return 0.0
    distance = 0
    for lang in changed:
        distance += 1 - ratio(current.labels[lang], parent.labels[lang])
    return distance / len(changed)
Example #14
File: spell.py Project: JKodner/spell
def find_lst(word, lst, rat=0.8, ld=2, let=0.75):
	"""This function returns words similar in spelling that are in in the specified 
	'lst' parameter to the inputted 'word' parameter.

	The 'rat' parameter specifies what Levenshtein word similarity ratio the words'
	similarity have to be greater than or equal to (defaults to 0.8). The 'ld' parameter
	specifies the maximum length difference between the words (defaults to 2). The 
	'let' parameter inputs a percentage in decimal form of how many similar letters are 
	shared (defaults to 0.75 which is 75%).

	Note: Function is case-insensitive"""

	from Levenshtein import ratio
	from math import floor
	if not isinstance(lst, list):
		raise ValueError("Inputted 'lst' must be list")
	if not isinstance(word, str):
		raise ValueError("Inputted 'word' must be string.")
	if not isinstance(rat, (int, float)) or (rat > 1) or (rat < 0):
		raise ValueError("Inputted 'rat' must be integer/float and be <= 1 and >= 0")
	if not isinstance(ld, int) or not (0 <= ld):
		raise ValueError("Inputted 'ld' must be integer and be >= 0")
	if not isinstance(let, (int, float)) or (let > 1) or (let < 0):
		raise ValueError("Inputtud 'led' must be integer/float and be <= 1 and >= 0")
	for i in lst:
		if not isinstance(i, str):
			raise ValueError("All values in 'lst' must be string")
	word = word.lower()
	lst = map(lambda x: x.lower(), lst)
	sim = []
	for i in lst:
		if ratio(word, i) >= rat:
			if abs(len(word) - len(i)) <= ld:
				num = 0
				for x in i:
					if x in word:
						num += 1
				if num >= floor(len(word) * let):
					sim.append([ratio(word, i), i])
	sim.sort(reverse=True)
	return [i[1] for i in sim]
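
A short usage sketch (the word list is invented for illustration):

print(find_lst("speling", ["spelling", "spewing", "selling", "bowling"]))
# ['spelling', ...] -- close matches, highest Levenshtein ratio first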
Example #15
def calc_venue_match_confidence(venue1, venue2):

    """
    calculates distance between two venues by comparing names, 
    social media handles, URLs and categories
    """

    # just need the venue data, not the whole API response
    if venue1.get('response'):
        v1 = venue1['response']['venue']
    else:
        v1 = venue1
    if venue2.get('response'):
        v2 = venue2['response']['venue']
    else:
        v2 = venue2

    # Levenshtein similarity ratio of the names
    name_distance = ratio(v1['name'], v2['name'])
    url_match = 0.0
    social_media_match = 0.0
    category_match = 0.0

    # compare URLs
    if v1.get('url') and v2.get('url'):
        netloc1 = urlparse(v1['url']).netloc
        netloc2 = urlparse(v2['url']).netloc
        if netloc1 and netloc2 and netloc1 == netloc2:
            url_match = 1.0

    # compare social media
    if v1.get('contact') and v2.get('contact'):
        if v1['contact'].get('twitter') and v2['contact'].get('twitter'):
            if v1['contact']['twitter'] == v2['contact']['twitter'] and v1['contact']['twitter'] and v1['contact']['twitter'] != "none":
                social_media_match += 1.0
        if v1['contact'].get('facebook') and v2['contact'].get('facebook'):
            if v1['contact']['facebook'] == v2['contact']['facebook'] and v1['contact']['facebook'] and v1['contact']['facebook'] != "none":
                social_media_match += 1.0

    # compare categories if names match: +1.0 for a match, -1.0 for no match
    if name_distance > 0.9:
        c1 = set()
        c2 = set()
        if v1.get('categories') and v2.get('categories'):
            for category in v1['categories']:
                c1.add(category)
            for category in v2['categories']:
                c2.add(category)
        common = c1 & c2
        if len(common) > 0:
            category_match = 1.0
        else:
            category_match = -1.0

    return name_distance, url_match, social_media_match, category_match
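
A minimal sketch of calling this on bare venue dicts (the field values are
invented for illustration; real API responses carry many more fields):

v1 = {'name': 'Blue Bottle Coffee',
      'url': 'https://bluebottlecoffee.com/cafes/abc',
      'contact': {'twitter': 'bluebottleroast'},
      'categories': ['Coffee Shop']}
v2 = {'name': 'Blue Bottle Coffee',
      'url': 'https://bluebottlecoffee.com/cafes/xyz',
      'contact': {'twitter': 'bluebottleroast'},
      'categories': ['Coffee Shop']}
print(calc_venue_match_confidence(v1, v2))  # (1.0, 1.0, 1.0, 1.0)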
    
Example #16
def process_mean_distance_desc(parent, current, differ):
    changed = differ.changed()
    if not changed:
        return 0.0
    distance = 0
    for lang in changed:
        distance += (
            1 - ratio(current.descriptions[lang], parent.descriptions[lang]))
    return distance / len(changed)
def similarity_finder2(line, global_lang, thd=0.9):
    sim = {}
    for address in global_lang:
        line_ratio = ratio(global_lang[address], line)
        if line_ratio >= thd:
            line_ratio = ceil(line_ratio * 100)
            line2 = global_lang[address]
            sim.setdefault(line_ratio, [])
            sim[line_ratio].append(line2)
    return sim
Example #18
File: notes_manager.py Project: rgs1/notes
 def _do_match(self, needle, note_attrib_value):
     for token_note in note_attrib_value.split():
         for token_needle in needle.split():
             result = ratio(unicode(token_note), unicode(token_needle))
             if self.debug:
                 print "Searching match for %s against %s got: %f" % \
                     (token_needle, token_note, result)
             if result >= self.match_threshold:
                 return True
     return False
def find_minister(text):
    best_ratio = 0
    best_minister = ''
    for m in ministers:
        r = ratio(text, m)
        if r > best_ratio:
            best_ratio = r
            best_minister = m

    return best_minister
def checkForRetwitterUser(text):

    text = unidecode(text)
    flag = False
    for word1 in keyPerson:
        if ratio(word1.lower(), text.lower()) > 0.9:
            flag = True
            break

    return flag
Example #21
 def search_similar(self, line, thd=0.5):
     search_result = {}
     for filename in self.data:
         for index in self.data[filename]:
             line_ratio = ratio(line, self.data[filename][index])
             if line_ratio >= thd:
                 line_ratio = round(line_ratio * 100)
                 search_result.setdefault(line_ratio, [])
                 search_result[line_ratio].append((filename, index))
     return search_result
Example #22
File: views.py Project: yankees714/direct
    def similarity_to_query(s):
        query_lower = query.lower()

        # Lolz
        keywords = ["gym", "dining", "james bond"]
        if any(query_lower == w for w in keywords) and s.fname == "Franco":
            return -1

        fields = (s.fname, s.lname, s.full_name(), s.su, s.email, s.apt)
        return min(-ratio(f.lower(), query_lower) for f in fields if f)
Example #23
File: compare.py Project: badrange/ssr
def identify(feature):
  lon, lat = map(float,feature['geometry']['coordinates'])
  sname    = feature['properties']['enh_snavn']
  forname  = feature['properties']['for_snavn']
  ssrid    = feature['properties']['enh_ssr_id']

  #queryByName = (XAPI_URL + NAME_Q) % (lon-TOL, lat-TOL, lon+TOL, lat+TOL, quote_plus(sname))
  queryWoName = (XAPI_URL) % (lon-TOL, lat-TOL, lon+TOL, lat+TOL)
  
  osm = tree.parse(queryWoName)
  
  names = osm.findall(".//tag[@k='name']")

  status[ssrid] = {}
  status[ssrid]['found']=False
  bestratio = 0

  for name in names:
    osmname = unicode(name.get('v'))
    osmid   = name.getparent().get('id')
    osmlon  = name.getparent().get('lon') # only works for nodes, or we'll a) have to fetch references b) find a better distance calculation
    osmlat  = name.getparent().get('lat')
    if osmlon:
      dx = float(osmlon)-lon
      dy = float(osmlat)-lat
      distance = sqrt(dx*dx+dy*dy) # GIS people are allowed to simplify like this
    else:
      distance = float("inf")
    if sname == osmname or forname == osmname:
      if status[ssrid]['found']: # multiple matches
        status[ssrid]['nodes'].append({"osmid":osmid, "distance":distance})
      else:
        status[ssrid]['found']=True
        status[ssrid]['nodes']=[{"osmid":osmid, "distance":distance}]
        print "IDENTIFIED", osmname
    else:
      delta = max( ratio(osmname, sname), ratio(osmname, forname) )
      if delta > bestratio:
        status[ssrid]['bestmatch'] = {"osmname":osmname, "osmid":osmid, "levenshtein":delta}
        bestratio = delta
  if not status[ssrid]['found']:
    print "Not found:", sname, "Best match:", str(status[ssrid].get('bestmatch',"None"))
Example #24
File: clipAdaptor.py Project: gzluo/NGS
def cliA(line, lenline, adapt, lenadapt, short_mm_len, extend,
        editDratio, shortReadsLen):
    reads = line
    #-----Begin full adaptor-----------------------
    tmpAdapt = adapt
    exactPos = line.rfind(tmpAdapt)
    if exactPos != -1 and exactPos > shortReadsLen:
        if extend == 'yes':
            return line[:exactPos]
        else:
            if exactPos + lenadapt == lenline:
                return line[:exactPos]
        #---------------------------------------------
    else:
        if extend == 'yes':
            end = lenadapt + shortReadsLen - 1 # start, no less than
                                               # shortReadsLen
        else:
            end = lenline -1
        for j in range(lenline, end, -1):
            tmpReads = line[j-lenadapt:j]
            if ratio(tmpReads, tmpAdapt) >= editDratio:
            # and tmpReads[0] == tmpAdapt[0]:
                return line[:j-lenadapt]
        #------------mismatch----------------
    #---------------End full adaptor----------------
    #--------------Begin partly clipped----------
    for i in range(lenadapt-1, short_mm_len, -1):
        tmpAdapt = adapt[:i]
        exactPos = line.rfind(tmpAdapt)
        if exactPos != -1 and exactPos+i == lenline:
            return line[:exactPos]
        else:
            tmpReads = line[lenline-i:lenline]
            if ratio(tmpReads, tmpAdapt) >= editDratio:
                #and \
                #tmpReads[0] == tmpAdapt[0]:
                return line[:lenline-i]
            #------------mismatch----------------
        #---------------End full adaptor----------------
    #--no process-----------------------------------
    return reads
Example #25
def select_query(a, search_title):
    if search_title:
        from Levenshtein import ratio as ratio
        return (ratio(a['title'].encode("utf-8"),
                      search_title.encode("utf-8")) > 0.50) \
                      and a['text'] and a['author'] \
                      and a['location'] and a['type'] == "Highlight"
    else:
        return a['title'] and a['text'] \
               and a['author'] and a['location'] \
               and a['type'] == "Highlight"
Example #26
def suggest_lang(word):
    _cache = cache[args.lang]
    if word not in _cache:
        suggestions = speller.suggest(word)
        
        result = [(suggestion, r) for suggestion, r in 
            ((suggestion, ratio(word, suggestion)) for suggestion in suggestions[:3] if ' ' not in suggestion and '-' not in suggestion)
            if (r > 0.9)
            ]
        _cache[word] = result
    return _cache[word]
Example #27
 def near_match(self, token_a, token_b):
     result = self.match(token_a, token_b)
     if result==0:
         return 0
     r = ratio(token_a.token_string, token_b.token_string)
     # print(str(token_a)+" "+str(token_b)+" "+str(r))
     if r > 0.6:
         return 1
     else:
         return -1
Example #28
 def _get_most_suitable_player(self):
     """
     Looks in the roster for a player with an almost identical name. If any, it 
     returns it
     """
     score, pl_name = max((ratio(pl_name, self.name), pl_name) 
                                             for pl_name in self.team_info.players_.keys())
     _pl_name = self.name.split(' ')[0]
     _suit_player_name = pl_name.split(' ')[0]
     if _pl_name[:3] in _suit_player_name[:3] and score >= 0.65:
         return pl_name
Example #29
 def computeData(self, results):
   """Choses the record/artist couple with the best correspondancy."""
   records = results['recording-list']
   score_list = []
    # Adding every record whose title has a high enough similarity ratio.
   for record in records:
     title_r = ratio(record['title'], self.localData.title)
     if title_r >= MinSimilarityRatio:
       score_list.append({'title_r':title_r,'record':record,'artist_r':0,'artist':''})
    # Finding the best artist correspondence for each record.
   best_artist=''
   best_ratio = 0
   best_element = {'title_r':0, 'record':records[0], 'artist_r':0, 'artist':self.localData.artist}
   # Doesn't try to compare artists if no one has been given.
   if(self.localData.artist != ""):
     for element in score_list:
       best_local_name = ''
       best_local_ratio = 0
       for artist_credit in element['record']['artist-credit']:
         local_name = artist_credit['artist']['name']
          local_ratio = ratio(local_name, self.localData.artist)
         # Locally maximizing the ratio for each artist corresponding to the record.
         if local_ratio > best_local_ratio:
           best_local_name = local_name
           best_local_ratio = local_ratio
         # We are trying to maximise the best ratio.
         if local_ratio > best_ratio:
           best_element = element
           best_ratio = local_ratio
           element['artist'] = best_local_name
           element['artist_r'] = local_ratio
     if best_ratio > MinSimilarityRatio:
       record = best_element
       self.remoteData.title = record['record']['title']
       self.remoteData.artist = record['artist']
     else:
       self.remoteData.artist = ''
       self.remoteData.title = ''
   else:
     self.remoteData.title = score_list[0]['record']['title']
     self.remoteData.artist = score_list[0]['record']['artist-credit'][0]['artist']['name']
Example #30
def get_similar_by_levens(incorrect):
    """Take all words of similar distance
    and return five most similar as measured
    by Levenstein distance"""
    vals = {}
    WORDS = get_smlar_len_words(incorrect)
    for correct in WORDS:
        similarity_ratio = ratio(incorrect, correct)
        if similarity_ratio > 0.65:
            vals[correct] = similarity_ratio
    all_suggestions = sorted(vals, key=vals.get, reverse=True)
    return all_suggestions[:5]
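
A rough usage sketch; the stub below stands in for the real
get_smlar_len_words helper, purely for illustration:

def get_smlar_len_words(word):
    return ["spelling", "spewing", "selling", "swelling", "smelling", "yelling"]

print(get_similar_by_levens("speling"))
# up to five candidates, ordered from highest to lowest similarity ratio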
Example #31
def first_sentence_score(dataset: list, refer_dataset):
    """
    Not useful.
    :param dataset:
    :param refer_dataset:
    :return:
    """
    similarity_score_list = []
    for sample in dataset:
        sample_refer_dataset = random.sample(refer_dataset, 50)
        first_sentence = sample['essay_sent'][0]

        refer_first_sentences = [sample['essay_sent'][0] for sample in sample_refer_dataset]
        similarities = [ratio(first_sentence, sent) for sent in refer_first_sentences]
        scores = [refer_sample['domain1_score'] for refer_sample in sample_refer_dataset]
        similarity_score = np.average([similarity * score for similarity, score in zip(similarities, scores)])
        sample['first_sentence_score'] = similarity_score
        similarity_score_list.append(similarity_score)

    return {'first_sentence_score': {'mean': np.mean(similarity_score_list), 'std': np.std(similarity_score_list)}}
Example #32
    def lookup(self, s):
        if not self.d:
            self.load()

        try:
            print "Using self"
            return self.d[s.lower()]
        except KeyError:
            try:
                print "Using stemmer: " + self.stemmer.stem(s).lower()
                return self.d[self.stemmer.stem(s).lower()]
            except KeyError:
                try:
                    print "Using lemmatizer: " + self.lemmatizer.lemmatize(
                        s).lower()
                    return self.d[self.lemmatizer.lemmatize(s).lower()]
                except KeyError:
                    (score, match) = max((ratio(s, t), t) for t in self.d)
                    self.d[s] = self.d[match]
                    print "Using Levenshtein: " + match
                    return self.d[match]
Example #33
def text_similarity_score(dataset: list, refer_dataset):
    """
    Not useful.
    :param dataset:
    :param refer_dataset:
    :return:
    """
    similarity_score_list = []
    for sample in dataset:
        sample_refer_dataset = random.sample(refer_dataset, 20)

        similarities = [ratio(sample['essay'], refer_sample['essay']) for refer_sample in sample_refer_dataset]
        # top_k = simi.argsort()[-int(refer_matrix.shape[1]/50):][::-1].tolist()

        scores = [refer_sample['domain1_score'] for refer_sample in sample_refer_dataset]
        similarity_score = np.average([similarity * score for similarity, score in zip(similarities, scores)])
        sample['text_similarity_score'] = similarity_score
        similarity_score_list.append(similarity_score)
        # print(np.mean(similarity_score), sample['domain1_score'])

    return {'text_similarity_score': {'mean': np.mean(similarity_score_list), 'std': np.std(similarity_score_list)}}
Example #34
def score_reconciliation(txn, payment):
    words = txn.payee.replace('-', ' ').split(' ')

    bankref_distances = [ratio(w, payment.bankref) for w in words]
    # Get the two best matches, for the two parts of the bankref
    bankref_score = sum(sorted(bankref_distances)[-2:])
    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0

    if txn.amount == payment.amount:
        other_score += 0.4

    if txn.account.currency == payment.currency:
        other_score += 0.6

    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
Example #35
    def annotate(self, training_set):

        #Levenshtein distance - minimum number of single character edits
        distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
        #Levenshtein ratio - similarity of two strings
        ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
        #Jaro - similarity score
        jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
        #Jaro-winkler - similarity score, which favors strings that match prefix from the beginning
        jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
        #fuzz partial ratio - gives a score based on how well parts of a string match another
        fuzz_partial_ratio_udf = udf(
            lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())

        training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
            .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro_wrinkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
            .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))

        return training_set
Example #36
def approximate_answers(q):
    max_score = 0
    answers = ""
    prediction = ""
    
    for idx, row in qa_datasets.iterrows():
        score = ratio(row['Question'], q)
        if score >= 0.8:
            return row['Answer'], score, row['Question']
        
        elif score > max_score:
            max_score = score
            answer = row["Answer"]
            prediction = row["Question"]

    if max_score > 0.51:
        return answer, max_score, prediction

    else:
        return "Maap aku gak ngerti kamu ngomong apa... :(", max_score, prediction
            
Example #37
    def matchQuery(self, globs, text, regexList):
        score = 0
        name = self.nameU
        typeName = self.typeNameU

        for regex in regexList:
            if not regex.search(name):
                return False

        for t in globs:
            pos = name.find(t)
            if pos >= 0:
                k = max(1.0 - float(pos) / 20.0, 0.0)
                score += (k * k * 1.0 + 0.1)
            score += (ratio(t, name) * 0.1)

        if globs[0] in typeName:
            findInType = True
            score *= 1.1

        self.matchScore = score
        return score > 0
Example #38
def crossref_query_title(title):
    """Contacts Crossref API for DOI of a paper

    The paper is identified by its title.
    The function retrieves the first 5 results, and searches for the one
    with maximum similarity to the original title.

    Raises an HTTPError in case of failure.

    Args:
        title: a str with the title of the paper whose DOI we are looking for
    """

    api_url = "https://api.crossref.org/works?"
    params = {"rows": "5", "query.title": title}
    url = api_url + urlencode(params, quote_via=quote_plus)
    request = Request(url)
    request.add_header(
        "User-Agent", "doi4bib utility\
                       (https://github.com/sharkovsky/doi4bib)")
    try:
        ret = urlopen(request)
        content = ret.read()
        data = json.loads(content.decode('utf-8'))
        items = data["message"]["items"]
        most_similar = EMPTY_RESULT
        for item in items:
            title = item["title"].pop()
            result = {
                "crossref_title": title,
                "similarity": ratio(title.lower(),
                                    params["query.title"].lower()),
                "doi": item["DOI"]
            }
            if most_similar["similarity"] < result["similarity"]:
                most_similar = result
        return {"success": True, "result": most_similar}
    except HTTPError as httpe:
        return {"success": False, "result": EMPTY_RESULT, "exception": httpe}
Example #39
def findTorrentPage(phenny, input):
    import urllib
    from BeautifulSoup import BeautifulSoup, SoupStrainer
    from Levenshtein import ratio

    if '-' not in input.group(2):
        raise Exception()

    (artist, album) = input.group(2).split('-')
    functionEnd = artist.find(' ')
    artist = artist[functionEnd + 1:].strip()
    album = album.strip()
    searchString = artist + u' - ' + album

    data = {'artistname': artist, 'groupname': album}
    data = urllib.urlencode(data)
    url = 'https://ssl.what.cd/torrents.php?action=advanced&' + data

    html = getHTML(url)

    tableStrainer = SoupStrainer(id='torrent_table')
    tableResults = BeautifulSoup(html, tableStrainer)

    firstArtist = tableResults.find('a', href=re.compile('artist.php\?id\='))

    if firstArtist is None:
        raise SyntaxError()

    foundArtist = firstArtist.string
    foundAlbum = firstArtist.next.next.next.string

    foundString = foundArtist + ' - ' + foundAlbum
    searchRatio = ratio(searchString.lower(), foundString.lower())

    return ('https://ssl.what.cd/' + tableResults.find(
        'a', href=re.compile('torrents.php\?id\='))['href'], searchRatio)
Example #40
def score_reconciliation(txn, payment):
    words = list(filter(None, re.split('\W+', txn.payee)))

    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    bankref_distances = [ratio(w, p) for w in words for p in bankref_parts]
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(sorted(bankref_distances)[-2:])
    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0

    if txn.amount == payment.amount:
        other_score += 0.4

    if txn.account.currency == payment.currency:
        other_score += 0.6

    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s', txn.id,
                     payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
    def compare(self, statement, other_statement):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """

        # Use python-Levenshtein if available
        from Levenshtein import ratio

        # Return 0 if either statement has a falsy text value
        if not statement.text or not other_statement.text:
            return 0

        # Get the lowercase version of both strings
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

        similarity = ratio(statement_text, other_statement_text)

        return similarity
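
A rough sketch of exercising compare with minimal stand-in objects (the
Statement stub and the comparator instantiation are invented for
illustration; in the real project this method lives on a comparison class):

class Statement(object):
    def __init__(self, text):
        self.text = text

comparator = LevenshteinComparator()  # hypothetical enclosing class
print(comparator.compare(Statement("Hi, how are you?"),
                         Statement("Hello, how are you?")))  # value in [0, 1]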
Example #42
def get_score(query, text):
    """
    Uses Levenshtein's algorithm + some improvements to the score
    :returns: number between 0 and 100
    """
    if not query or not text:
        return 0

    query = query.lower()
    text = text.lower()
    score = ratio(query, text) * 100

    # increase score if a word from text starts with a query
    for text_part in text.split(' '):
        if text_part.startswith(query):
            score += 30
            break

    # increase score if each separate group in indexes is a beginning of a word in text
    # example for query 'fiwebr' groups 'fi', 'we', and 'br' are matching word beginnings
    # of text 'Firefox Web Browser'
    # increase score for each such group
    increment = 10
    i = 0  # query iterator
    lq = len(query)
    for j, char in enumerate(text):
        # run until query ends and check if a query char. equals to current text char
        if i < lq and query[i] == char:
            # if char from query matches beginning of text or beginning of a word inside the text, increase the score
            if j == 0 or text[j - 1] in ' .(-_+)':
                score += increment
            i += 1
        elif i == lq:
            break

    return min(100, score)
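
Following the docstring's own illustration:

print(get_score('fiwebr', 'Firefox Web Browser'))
# Levenshtein ratio scaled to 100, plus bonuses for query characters
# that land on word beginnings in the text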
Example #43
def get_publication(title):
    EMPTY_RESULT = {
        "crossref_title": "",
        "similarity": 0,
        "doi": ""
    }

    api_url = "https://api.crossref.org/works?"
    params = {"rows": "5", "query.bibliographic": title}
    url = api_url + urlencode(params, quote_via=quote_plus)

    request = Request(url)
    request.add_header("User-Agent",
                       "OpenAPC DOI Importer (https://github.com/OpenAPC/openapc-de/blob/master/python/import_dois.py; mailto:[email protected])")
    full_data = None
    try:
        ret = urlopen(request)
        content = ret.read()
        data = json.loads(content)
        items = data["message"]["items"]
        most_similar = EMPTY_RESULT
        for item in items:
            if "title" not in item:
                continue
            title = item["title"].pop()
            result = {
                "crossref_title": title,
                "similarity": ratio(title.lower(), params["query.bibliographic"].lower()),
                "doi": item["DOI"]
            }
            if most_similar["similarity"] < result["similarity"]:
                most_similar = result
                full_data = item
        return {"success": True, "result": most_similar, "crossref": full_data}
    except HTTPError as httpe:
        return {"success": False, "result": EMPTY_RESULT, "exception": httpe}
Example #44
def MatchStops():
    dft = pd.read_csv('./Data/turnstile_191026.txt')
    dfs = pd.read_csv('./Data/stops.txt')
    turnstile_stop_names = []
    stop_stop_names = []
    matches = []
    for x in sorted(dft['STATION'].unique()):
        stop_name = x
        lower_stop_name = x.lower()
        turnstile_stop_names.append((lower_stop_name, stop_name))
    for x in sorted(dfs['stop_name'].unique()):
        stop_id = dfs['stop_id'][dfs['stop_name'] == x]
        stop_stop_names.append((x.lower(), stop_id[stop_id.index[0]]))
    for turn in turnstile_stop_names:
        ratio_list = []
        for stop in stop_stop_names:
            ratio_list.append(
                (stop[0], stop[1], turn[0], turn[1], ratio(turn[0], stop[0])))
        best_match = sorted(ratio_list, key=lambda x: x[-1])[-1]
        if best_match[-1] < 0.7:
            print('{}-->{}'.format(turn[1], best_match))
        else:
            matches.append((turn[1], best_match[1]))
    pd.DataFrame(matches, columns=['STATION', 'stop_id']).to_csv('test.csv')
Example #45
 def quick_ratio(self):
     # This is usually quick enough :o)
     if not self._ratio:
         self._ratio = ratio(self._str1, self._str2)
     return self._ratio
def levDistanceCompare(str1, str2, value):
	return ratio(str1, str2) > value
Example #47
def populate_pokeset(pokeset, skip_ev_check=False):
    """
    Reads in data for one pokeset and populates it with all additionally available
    data. This includes types of Pokémon or per-move data like PP, power or types.

    Arguments:
        pokeset: base data of the set to populate. see the format specification for details.
        skip_ev_check: Defaults to False. If True, allows illegal movesets (produces a
                       warning instead of an error)
    Throws:
        ValueError: If the data is not fully parsable. the ValueError's description contains
        further details on the error that occured.
    Returns:
        The populated set. The passed data is not modified
    """
    # I am sorry that this function is so big and partly copy-pasted,
    # but it just does a lot of equally boring things like processing
    # special cases. I couldn't come up with a structure that wouldn't
    # just feel forced. It could be better, but it could also be worse,
    # and to be honest it's easy enough to maintain (for me at least).

    # make deepcopy to not modify original data
    pokeset = deepcopy(pokeset)

    # check if there are wrongly capitalized keys
    for key, value in list(pokeset.items()):
        key_lower = key.lower()
        if key_lower != key:
            warn("Key should be all lowercase: %s" % key)
            del pokeset[key]
            pokeset[key_lower] = value

    # check that all obligatory fields are present
    present_fields = set(pokeset.keys())
    missing_fields = _OBLIGATORY_FIELDS - present_fields
    if missing_fields:
        raise ValueError("pokeset is missing obligatory fields: %s" %
                         ", ".join(missing_fields))

    # check if there are unknown fields
    unrecognized_fields = present_fields - (set(_OPTIONAL_FIELDS.keys())
                                            | _OBLIGATORY_FIELDS)
    if unrecognized_fields:
        raise ValueError("pokeset has unrecognized fields: %s" %
                         ", ".join(unrecognized_fields))

    # trim all leading and trailing whitespaces
    # TODO test if the yaml parser already does this
    #for k, v in pokeset.items():
    #    pokeset[k] = v.strip()

    # fill in optional fields
    for key, default in _OPTIONAL_FIELDS.items():
        if key not in pokeset:
            pokeset[key] = deepcopy(default)

    # check validity of names
    if not pokeset["setname"] or not isinstance(pokeset["setname"], str):
        raise ValueError("setname must be a non-empty string")
    custom_displayname = False
    if pokeset["displayname"] is not None:
        custom_displayname = True
        if not pokeset["displayname"] or not isinstance(
                pokeset["displayname"], str):
            raise ValueError("displayname, if set, must be a non-empty string")

    # check and populate species
    species_raw = pokeset["species"]
    if species_raw is None:
        raise ValueError("Invalid species: %s" % (species_raw, ))
    species, perfect_match = _get_by_index_or_name(gen4data.POKEDEX,
                                                   species_raw, "species",
                                                   gen4data.get_pokemon,
                                                   gen4data.find_pokemon)
    if not perfect_match:
        warn("Didn't recognize species %s, but assumed %s." %
             (species_raw, species["name"]))
    pokeset["species"] = species

    # check tags
    tags = pokeset["tags"]
    if not isinstance(tags, list) or not all(
            isinstance(tag, str) for tag in tags):
        raise ValueError("tags must be a list of strings")
    pokeset["tags"] = tags

    # replace None-default for ingamename
    if pokeset["ingamename"] is None:
        pokeset["ingamename"] = species["name"].upper()
        if pokeset["shiny"]:
            pokeset["ingamename"] = pokeset["ingamename"][:8] + "-S"
    # check length of ingamename
    if not 1 <= len(pokeset["ingamename"]) <= 10:
        raise ValueError(
            "ingamename must be between 1 and 10 characters long: %s" %
            pokeset["ingamename"])

    # check happiness
    if not isinstance(pokeset["happiness"], int):
        raise ValueError("happiness must be a number.")

    # check and populate ability. is a list
    ability = []
    ability_raw = pokeset["ability"]
    if not isinstance(ability_raw, list):
        ability_raw = [ability_raw]
    if not ability_raw:
        raise ValueError("Ability cannot be an empty list.")
    for ability_raw_single in ability_raw:
        ability_single, perfect_match = _get_by_index_or_name(
            gen4data.ABILITIES, ability_raw_single, "ability",
            gen4data.get_ability, gen4data.find_ability)
        if not perfect_match:
            warn("Didn't recognize ability %s, but assumed %s." %
                 (ability_raw_single, ability_single["name"]))
        ability.append(ability_single)
    if len(set(a["id"] for a in ability)) < len(ability):
        raise ValueError("All abilities supplied must be unique: %s" %
                         ", ".join(a["name"] for a in ability))
    pokeset["ability"] = ability

    # check and populate item. is a list
    item = []
    item_raw = pokeset["item"]
    if not isinstance(item_raw, list):
        item_raw = [item_raw]
    if not item_raw:
        raise ValueError("Item cannot be an empty list.")
    for item_raw_single in item_raw:
        item_single, perfect_match = _get_by_index_or_name(
            gen4data.ITEMS, item_raw_single, "item", gen4data.get_item,
            gen4data.find_item)
        if not perfect_match:
            warn("Didn't recognize item %s, but assumed %s." %
                 (item_raw_single, item_single["name"]))
        item.append(item_single)
    if len(set(i["id"] for i in item)) < len(item):
        raise ValueError("All items supplied must be unique: %s" %
                         ", ".join(i["name"] for i in item))
    pokeset["item"] = item

    # check and populate ball. is a list
    ball = []
    ball_raw = pokeset["ball"]
    if not isinstance(ball_raw, list):
        ball_raw = [ball_raw]
    if not ball_raw:
        raise ValueError("Ball cannot be an empty list.")
    for ball_raw_single in ball_raw:
        ball_single, perfect_match = _get_by_index_or_name(
            gen4data.ITEMS, ball_raw_single, "ball", gen4data.get_ball,
            gen4data.find_ball)
        if not ball_single["name"].endswith(" Ball"):
            raise ValueError("Invalid ball: %s" % ball_single)
        if not perfect_match:
            warn("Didn't recognize ball %s, but assumed %s." %
                 (ball_raw_single, ball_single["name"]))
        ball.append(ball_single)
    if len(set(b["name"] for b in ball)) < len(ball):
        raise ValueError("All balls supplied must be unique: %s" %
                         ", ".join(b["name"] for b in ball))
    pokeset["ball"] = ball

    # check gender
    gender = pokeset["gender"]
    if not isinstance(gender, list):
        gender = [gender]
    for gender_single in gender:
        if gender_single not in ("m", "f", None):
            raise ValueError(
                "gender can only be 'm', 'f' or not set (null), but not %s" %
                (gender_single, ))
    if len(gender) > 1 and None in gender:
        raise ValueError("non-gender cannot be mixed with m/f")
    if len(set(gender)) < len(gender):
        raise ValueError("All genders supplied must be unique: %s" %
                         ", ".join(gender))
    pokeset["gender"] = gender

    # check level
    level = pokeset["level"]
    if not (isinstance(level, int) and 1 <= level <= 100):
        raise ValueError("level must be a number between 1 and 100")

    # check and populate nature. might be defined as "+atk -def" or similar
    nature_raw = pokeset["nature"]
    if not isinstance(nature_raw, str):
        raise ValueError("Invalid nature: %s" % (nature_raw, ))
    stats_regex = "|".join(stats.statnames)
    match = re.match(r"^\+({0})\s+-((?:\1){0})$".format(stats_regex),
                     nature_raw)
    if match:
        increased = match.group(1)
        decreased = match.group(2)
        matching_nature = [
            n for n in gen4data.NATURES
            if n["increased"] == increased and n["decreased"] == decreased
        ]
        if matching_nature:
            nature_raw = matching_nature[0]["name"]
    nature, perfect_match = _get_by_index_or_name(gen4data.NATURES, nature_raw,
                                                  "nature",
                                                  gen4data.get_nature,
                                                  gen4data.find_nature)
    if not perfect_match:
        warn("Didn't recognize nature %s, but assumed %s." %
             (nature_raw, nature["name"]))
    pokeset["nature"] = nature

    # check IVs
    ivs = pokeset["ivs"]
    if isinstance(ivs, int):
        ivs = {name: ivs for name in stats.statnames}
    if not isinstance(ivs, dict):
        raise ValueError("Invalid IVs: %s" % (ivs, ))
    if set(stats.statnames) != set(ivs.keys()):
        raise ValueError("ivs must contain the following keys: %s" %
                         ", ".join(stats.statnames))
    if not all(isinstance(v, int) for v in ivs.values()):
        raise ValueError("Invalid IV value in IVs: %s" % (ivs, ))
    if not all(0 <= val <= 31 for val in ivs.values()):
        raise ValueError("All IVs must be between 0 and 31.")
    pokeset["ivs"] = ivs
    # check EVs
    evs = pokeset["evs"]
    if isinstance(evs, int):
        evs = {name: evs for name in stats.statnames}
    if not isinstance(evs, dict):
        raise ValueError("Invalid EVs: %s" % (evs, ))
    if set(stats.statnames) != set(evs.keys()):
        raise ValueError("evs must contain the following keys: %s" %
                         ", ".join(stats.statnames))
    if not all(isinstance(v, int) for v in evs.values()):
        raise ValueError("Invalid EV value in EVs: %s" % (evs, ))
    if not all(0 <= val for val in evs.values()):
        raise ValueError("All EVs must be >= 0.")
    if not all(val <= 252 for val in evs.values()):
        message = "All EVs must be <= 252."
        if skip_ev_check:
            warn(message)
        else:
            raise ValueError(message)
    ev_sum = sum(val for val in evs.values())
    if ev_sum > 510:
        message = "Sum of EV must not be larger than 510, but is %d" % ev_sum
        if skip_ev_check:
            warn(message)
        else:
            raise ValueError(message)
    for key, value in evs.items():
        if value % 4 != 0:
            warn(
                "EV for %s is %d, which is not a multiple of 4 (wasted points)"
                % (key, value))
    pokeset["evs"] = evs

    # TODO outsource singular move processing
    # check and populate moves
    moves = []
    moves_raw = pokeset["moves"]
    if not 1 <= len(moves_raw) <= 4:
        raise ValueError(
            "Pokémon must have between 1 and 4 moves, but has %d" %
            len(moves_raw))
    for move_raw in moves_raw:
        move = []
        if not isinstance(move_raw, list):
            move_raw = [move_raw]
        for move_raw_single in move_raw:
            pp = None
            pp_ups = 0
            # move might have pp-up and fixed pp information
            pp_info = re.search(r"\(\+\d+\)|\(=\d+\)|\(\+\d+/=\d+\)$",
                                move_raw_single)
            if pp_info:
                move_raw_single = move_raw_single[:pp_info.start() - 1]
                for bit in pp_info.group(0).strip("()").split("/"):
                    if bit.startswith("+"):
                        pp_ups = int(bit[1:])
                    elif bit.startswith("="):
                        pp = int(bit[1:])
            move_single, perfect_match = _get_by_index_or_name(
                gen4data.MOVES, move_raw_single, "move", gen4data.get_move,
                gen4data.find_move)
            if not perfect_match:
                warn("Didn't recognize move %s, but assumed %s." %
                     (move_raw_single, move_single["name"]))
            move_single["pp_ups"] = pp_ups
            pp = pp or move_single["pp"]
            pp = int(pp * (1 + 0.2 * pp_ups))
            move_single["pp"] = pp
            move.append(move_single)
        moves.append(move)
    pokeset["moves"] = moves

    # check rarity
    rarity = pokeset["rarity"]
    if not (isinstance(rarity, (int, float)) and rarity >= 0.0):
        raise ValueError("rarity must be a number greater or equal to 0.0")
    if rarity > 10.0:
        warn(
            "rarity is %d, which is surprisingly high. Note that 1.0 is the default "
            "and high values mean the Pokémon gets chosen more often." %
            rarity)

    # fix default biddable value
    if pokeset["biddable"] is None:
        pokeset["biddable"] = not pokeset["shiny"]
    if not isinstance(pokeset["biddable"], bool):
        raise ValueError("biddable must be a boolean (true or false), not %s" %
                         type(pokeset["biddable"]))

    # fix default hidden value
    if pokeset["hidden"] is None:
        pokeset["hidden"] = pokeset["shiny"]
    if not isinstance(pokeset["hidden"], bool):
        raise ValueError("hidden must be a boolean (true or false), not %s" %
                         type(pokeset["hidden"]))

    if pokeset["biddable"] and pokeset["hidden"]:
        warn("Set is biddable, but also hidden, which doesn't make sense.")
    if pokeset["shiny"] and not pokeset["hidden"]:
        warn("Set is shiny, but not hidden, which means it is not secret "
             "and usable in token matches at any time. Is this intended?")

    # fix displayname
    if pokeset["displayname"] is None:
        pokeset["displayname"] = pokeset["species"]["name"]
        # formnames get handled below

    # check form
    form = pokeset["form"]
    if not isinstance(form, int):
        if not isinstance(form, str):
            raise ValueError("form must be a formnumber or a string, not %s" %
                             type(form))
        formnumber = forms.get_formnumber(species["id"], form)
        if formnumber is None:
            raise ValueError("Unrecognized form %s for species %s" %
                             (form, species["name"]))
        form = formnumber
    pokeset["form"] = form
    formname = forms.get_formname(species["id"], form)
    if formname is None and form != 0:
        raise ValueError("Species %s has no form %s." %
                         (species["name"], form))

    # special case: all forms. fix displayname
    formname = forms.get_formname(species["id"], form)
    if formname and not custom_displayname:
        pokeset["displayname"] += " " + formname

    # special case: Deoxys. Fix basestats (displayname already fixed)
    if species["name"] == "Deoxys":
        deoxys_form = forms.get_formname(species["id"], form)
        species["basestats"] = gen4data.DEOXYS_BASESTATS[deoxys_form]

    # special case: Arceus. Handle as form. Also fix type
    if species["name"] == "Arceus":
        item = pokeset["item"]
        if len(item) > 1:
            raise ValueError("Arceus currently must have a fixed item")
        arceus_type = forms.get_multitype_type(item[0])
        pokeset["species"]["types"] = [arceus_type]
        if not custom_displayname:
            pokeset["displayname"] += " " + arceus_type
        #pokeset["form"] = gen4data.TYPES.index(arceus_type)

    # special case: Wormadam. Fix type
    if species["name"] == "Wormadam":
        wormadam_types = ("Grass", "Ground", "Steel")
        pokeset["species"]["types"] = ["Bug", wormadam_types[form]]

    # add stats
    pokeset["stats"] = {}
    for statname in stats.statnames:
        basestat = species["basestats"][statname]
        ev = evs[statname]
        iv = ivs[statname]
        level = pokeset["level"]
        pokeset["stats"][statname] = stats.calculate_stat(
            basestat, ev, iv, statname, nature, level)

    # special case: Shedinja. Always 1 HP
    if species["name"] == "Shedinja":
        pokeset["stats"]["hp"] = 1

    # add shininess to display name
    if pokeset["shiny"] and not custom_displayname:
        pokeset["displayname"] += " (Shiny)"

    # add autogenerated tags
    if pokeset["biddable"]:
        pokeset["tags"].append("biddable")
    if pokeset["hidden"]:
        pokeset["tags"].append("hidden")
    if pokeset["shiny"]:
        pokeset["tags"].append("shiny")
    pokeset["tags"].append("species+%d" % pokeset["species"]["id"])
    pokeset["tags"].append("species+%s" %
                           normalize_name(pokeset["species"]["name"]))
    for type_ in pokeset["species"]["types"]:
        pokeset["tags"].append("type+%s" % type_.lower())
    pokeset["tags"].append("level+%d" % pokeset["level"])
    pokeset["tags"].append("form+%d" % pokeset["form"])

    for ability_ in pokeset["ability"]:
        if ability_:
            pokeset["tags"].append("ability+%s" %
                                   normalize_name(str(ability_["name"])))
    pokeset["tags"].append("setname+%s" % normalize_name(pokeset["setname"]))
    if pokeset["rarity"] > 0:
        pokeset["tags"].append("matchmaker-enabled")

    # ensure no duplicate tags
    pokeset["tags"] = sorted(set(pokeset["tags"]))

    # check combinations and separations
    combinations = pokeset["combinations"]
    if not isinstance(combinations, list) or not all(
            isinstance(c, list) for c in combinations):
        raise ValueError("combinations must be a list of lists.")
    if not all(isinstance(s, str) or s is None for s in chain(*combinations)):
        raise ValueError("combination items must be strings or null")
    separations = pokeset["separations"]
    if not isinstance(separations, list) or not all(
            isinstance(s, list) for s in separations):
        raise ValueError("separations must be a list of lists.")
    if not all(isinstance(s, str) or s is None for s in chain(*separations)):
        raise ValueError("separation items must be strings or null")
    movenames = list({move["name"]
                      for movelist in pokeset["moves"] for move in movelist})
    all_things = (movenames + [p["name"] for p in pokeset["item"]] +
                  [a["name"] for a in pokeset["ability"]])
    ambiguities = set(item for item, count in Counter(all_things).items()
                      if count > 1)
    all_things = set(all_things)
    for com in combinations:
        if any(c in ambiguities for c in com):
            raise ValueError(
                "Can't use %s in combinations, as it is ambiguous." % (com, ))
        rest = set(com) - all_things
        for r in list(rest):
            if not r:
                continue
            for thing in all_things - {None}:
                if ratio(thing.lower(), r.lower()) > 0.9:
                    if is_difference_significant(thing, r):
                        warn("Didn't recognize combination %s, but assumed %s."
                             % (r, thing))
                    rest.remove(r)
                    com.remove(r)
                    com.append(thing)
                    break
        if rest:
            raise ValueError(
                "All things referenced in combination must be present in set. Missing: %s"
                % ", ".join(rest))
    for sep in separations:
        if any(s in ambiguities for s in sep):
            raise ValueError(
                "Can't use %s in separations, as it is ambiguous." % (sep, ))
        rest = set(sep) - all_things
        for r in list(rest):
            if not r:
                continue
            for thing in all_things - {None}:
                if ratio(thing.lower(), r.lower()) > 0.9:
                    if is_difference_significant(thing, r):
                        warn(
                            "Didn't recognize separation %s, but assumed %s." %
                            (r, thing))
                    rest.remove(r)
                    sep.remove(r)
                    sep.append(thing)
                    break
        if rest:
            raise ValueError(
                "All things referenced in separation must be present in set. Missing: %s"
                % ", ".join(rest))
    # TODO validate that the combinations and separations even allow for a functioning set to be generated
    return pokeset
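The recurring pattern in this validator — accept a user-supplied name when its Levenshtein ratio to a known name exceeds 0.9, then substitute the canonical spelling — can be isolated into a helper. A minimal sketch, assuming only python-Levenshtein; `canonicalize` and the example names are illustrative, not part of the original code:

from Levenshtein import ratio

def canonicalize(name, known, threshold=0.9):
    # Pick the known name most similar to `name`; accept it only above the threshold.
    best = max(known, key=lambda k: ratio(name.lower(), k.lower()))
    return best if ratio(name.lower(), best.lower()) > threshold else None

canonicalize("Thunderbol", ["Tackle", "Thunderbolt", "Hyper Beam"])  # -> "Thunderbolt"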
Example #48
File: engine.py  Project: pksmall/EliteOCR
    def cleanCommodities(self, data, levels):
        for i in xrange(len(data)):
            if data[i][0] is not None:
                mindist = 100
                topcomm = ""
                alternatives = []
                for comm in self.comm_list:
                    dist = distance(unicode(data[i][0].value), unicode(comm))
                    if dist < 7:
                        alternatives.append((unicode(comm), dist))
                    if dist < mindist:
                        mindist = dist
                        topcomm = comm
                    if dist == 0:
                        data[i][0].value = topcomm
                        data[i][0].confidence = 1.0
                        break
                alternatives.sort(key=lambda x: x[1])
                optional_values = [j[0] for j in alternatives]

                maxdist = 4
                if len(data[i][0].value) < 5:
                    maxdist = 3

                if mindist < maxdist:
                    data[i][0].value = topcomm
                    if mindist < 2:
                        data[i][0].confidence = 1.0
                    else:
                        data[i][0].confidence = 0.7
                    if mindist != 0:
                        data[i][0].optional_values = [data[i][0].value] + optional_values
                else:
                    data[i][0].confidence = 0.0
                    data[i][0].optional_values = [data[i][0].value] + optional_values
            # LOW MED HIGH
            if data[i][4] is not None and levels:
                try:
                    topratio = 0.0
                    toplev = ""
                    for lev in self.levels[self.lang]:
                        if data[i][4].value is None:
                            print "None!"
                        rat = ratio(unicode(data[i][4].value), unicode(lev))
                        if rat > topratio:
                            topratio = rat
                            toplev = lev
                    data[i][4].value = toplev
                except Exception:
                    # the OCR cell may hold a missing or non-string value; skip it
                    pass
            if data[i][6] is not None and levels:
                try:
                    topratio = 0.0
                    toplev = ""
                    for lev in self.levels[self.lang]:
                        rat = ratio(data[i][6].value, unicode(lev))
                        if rat > topratio:
                            topratio = rat
                            toplev = lev
                    data[i][6].value = toplev
                except Exception:
                    pass
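The matching strategy above — closest commodity by edit distance, accepted only under a length-dependent cutoff, with a confidence derived from the distance — can be sketched in isolation. A minimal Python 3 sketch, assuming python-Levenshtein; the commodity list is illustrative:

from Levenshtein import distance

COMMODITIES = ["Gold", "Palladium", "Consumer Technology"]  # illustrative

def nearest_commodity(text):
    # Find the closest known commodity and score confidence by edit distance.
    best = min(COMMODITIES, key=lambda c: distance(text, c))
    dist = distance(text, best)
    maxdist = 3 if len(text) < 5 else 4  # stricter cutoff for short strings
    if dist >= maxdist:
        return None, 0.0
    return best, 1.0 if dist < 2 else 0.7

nearest_commodity("Palladiun")  # -> ("Palladium", 1.0)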
Example #49
def findBestMatch(needle, haystack):
    return max(haystack, key=lambda x: ratio(needle, x))
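A quick usage sketch; the candidate strings are illustrative:

findBestMatch("color", ["colour", "colonel", "cooler"])  # -> "colour"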
Example #50
def similar(string, array):
    # Return the similarity ratio between `string` and each item in `array`.
    return [ratio(string, i) for i in array]
Example #51
import praw
from Levenshtein import ratio

# Assumed setup, not shown in the original snippet:
quotes = []  # list of lowercase quote strings, loaded elsewhere
greatest_length = 0
least_length = float('inf')
reddit = praw.Reddit()  # credentials are read from praw.ini

for quote in quotes:
    quote_len = len(quote)
    if quote_len > greatest_length:
        greatest_length = quote_len
    if quote_len < least_length:
        least_length = quote_len

for comment in reddit.subreddit(
        'AskReddit+movies+funny+pics').stream.comments():
    text = comment.body.lower()
    len_text = len(text)
    if len_text + 7 > greatest_length or len_text - 4 < least_length:
        continue

    greatest = 0
    best_quote = ''
    print(text)

    for quote in quotes:
        value = ratio(text, quote)
        if value > .75:
            print(text, quote)
        """
        if value > greatest:
            greatest = value
            best_quote = quote
        """

    # print(greatest, text, best_quote)
Example #52
def edit_dist_of(sent0, sent1, item):
    # Despite the name, this returns a similarity ratio (1.0 = identical),
    # not an edit distance.
    x, y = item
    return ratio(sent0[x], sent1[y])
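Packing the indices into one tuple makes the function easy to map over all index pairs. A usage sketch, assuming the function above and `from Levenshtein import ratio` are in scope; the sentences are illustrative:

from itertools import product

sent0 = ["the cat sat", "a dog ran"]
sent1 = ["the cat sat down", "a dog runs"]
scores = {pair: edit_dist_of(sent0, sent1, pair)
          for pair in product(range(len(sent0)), range(len(sent1)))}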
Example #53
def similarity_ratios(l):
    ratios = {}
    for i in range(len(l) - 1):
        for j in range(i + 1, len(l)):
            ratios[(l[i], l[j])] = ratio(l[i], l[j])
    return ratios
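The nested loops enumerate unordered pairs, which itertools.combinations expresses directly. An equivalent self-contained sketch:

from itertools import combinations
from Levenshtein import ratio

def similarity_ratios_alt(l):
    # Similarity ratio for every unordered pair of items in `l`.
    return {(a, b): ratio(a, b) for a, b in combinations(l, 2)}

similarity_ratios_alt(["kitten", "sitting", "mitten"])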
Example #54
def comparar_cad(cadena1, cadena2):
    # True if the two strings are at least 80% similar.
    return ratio(cadena1, cadena2) >= 0.8
Example #55
def apply_ratio(col1, col2):
    return ratio(col1, col2)
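A wrapper like this is typically applied row-wise over a DataFrame. A hedged usage sketch, assuming apply_ratio as defined above; the DataFrame and its column names are illustrative:

import pandas as pd

df = pd.DataFrame({"name_a": ["apple", "grape"], "name_b": ["appel", "graph"]})
df["sim"] = df.apply(lambda row: apply_ratio(row["name_a"], row["name_b"]), axis=1)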
Example #56
import os

import numpy as np
import pandas as pd

# Assumed setup: `base_dir` points at the directory holding the two CSV files.
base_dir = "."

csmar_codes = pd.read_csv(os.path.join(base_dir, "CSMAR_Codes.csv"))
bhc_codes = pd.read_csv(os.path.join(base_dir, "BHC_Codes.csv"))

# Inspect the loaded tables and the two description columns:
print(csmar_codes)
print(bhc_codes)
print(csmar_codes['CSMAR Variable Description'])
print(bhc_codes['Variable Description'])

from scipy.spatial.distance import cdist
from Levenshtein import ratio

arr1 = np.array(csmar_codes['CSMAR Variable Description'])
arr2 = np.array(bhc_codes['Variable Description'])

matrix = cdist(arr2.reshape(-1, 1), arr1.reshape(-1, 1),
               lambda x, y: ratio(x[0], y[0]))
df = pd.DataFrame(data=matrix, index=arr2, columns=arr1)
df_sim = df.transpose()

sim_score = .7
for ind in range(len(df_sim.index)):
    row = df_sim.iloc[ind, :]
    matches = row[row > sim_score]
    if len(matches) > 0:
        print("CSMAR index:", df_sim.index[ind])
        print(matches)

# Get relevant BHC and CSMAR data.
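Note that cdist with a callable metric may reject non-numeric arrays in newer SciPy releases. The same len(arr2) x len(arr1) similarity matrix can be built without SciPy; a minimal equivalent sketch:

import numpy as np
from Levenshtein import ratio

matrix = np.array([[ratio(a, b) for a in arr1] for b in arr2])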
Example #57
def ratio(self):
    # Lazily compute the similarity ratio of the two stored strings and cache it.
    if not self._ratio:
        self._ratio = ratio(self._str1, self._str2)
    return self._ratio
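On Python 3.8+, functools.cached_property provides the same lazy memoization without the manual cache attribute. A minimal sketch; the class name and attributes are illustrative:

from functools import cached_property
from Levenshtein import ratio as levenshtein_ratio

class StringPair:
    def __init__(self, str1, str2):
        self._str1, self._str2 = str1, str2

    @cached_property
    def ratio(self):
        # Computed once on first access, then cached on the instance.
        return levenshtein_ratio(self._str1, self._str2)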
Example #58
def compare(string1, string2):
    '''
    Compare two strings by character similarity using the Levenshtein ratio.
    Returns a value between 0 and 1: 0 means no overlap, 1 complete overlap.
    '''
    return ratio(string1, string2)
Example #59
File: MAD.py  Project: tallemeersch/MAD
def levenshtein(pair_of_sentences):
    score = ratio(pair_of_sentences[0].lower(), pair_of_sentences[1].lower())
    assert 0 <= score <= 1
    return score
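Taking the pair as a single argument makes the function convenient for multiprocessing. A hedged usage sketch, assuming the definition above with `ratio` imported; the sentence pairs are illustrative:

from multiprocessing import Pool

pairs = [("The cat sat.", "the cat sat"), ("Hello there", "Hello here")]
if __name__ == "__main__":
    with Pool() as pool:
        scores = pool.map(levenshtein, pairs)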
Example #60
def test_levenshtein_ratio():
    expected_ratio = ratio(normalize_input(TORONTO.name), KEY)
    actual_ratio = levenshtein._ratio(TORONTO.name, KEY)

    assert expected_ratio == actual_ratio