import string

import fuzzy


def check_words_equal(word1, word2):
    # Remove punctuation from both words before comparing.
    word1_mod = word1.translate(str.maketrans("", "", string.punctuation))
    word2_mod = word2.translate(str.maketrans("", "", string.punctuation))
    d_meta = fuzzy.DMetaphone()
    # Fuzzy match: compare the Double Metaphone codes of the cleaned words.
    # (The original compared word2 before punctuation removal.)
    return d_meta(word1_mod) == d_meta(word2_mod)
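# A minimal usage sketch (assumes the fuzzy package is installed):
# "Smith" and "Smyth" typically share both Double Metaphone codes, so this
# is expected to print True despite the punctuation.
print(check_words_equal("Smith!", "Smyth"))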
def dMetaphone(collection):
    """Return the metaphone encoding of a word or of each word in a list.

    Arguments:
    collection -- the string, or list of words, to encode with metaphone.
    """
    try:
        assert type(collection) == list or type(collection) == str
    except AssertionError:
        print("The collection for metaphone is not a string or a list.")
    import fuzzy
    dmetaphone = fuzzy.DMetaphone()
    if type(collection) == str:
        return dmetaphone(collection)
    collectionEncoded = list()
    for word in collection:
        wordEncoded = dmetaphone(word)
        if wordEncoded[0] is not None:
            wordEncoded[0] = wordEncoded[0].decode('UTF-8')
        if wordEncoded[1] is not None:
            wordEncoded[1] = wordEncoded[1].decode('UTF-8')
        collectionEncoded.append(wordEncoded)
    return collectionEncoded
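# Hedged usage sketch: a list yields decoded [primary, secondary] pairs,
# while a bare string is passed straight through to DMetaphone (raw bytes
# results). "smith" typically encodes to ['SM0', 'XMT'].
print(dMetaphone(["smith", "smyth"]))
print(dMetaphone("smith"))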
def get_index_keys(content, add=True):
    # Very simple word-based parser. We skip stop words and single
    # character words.
    words = NON_WORDS.sub(' ', content.lower()).split()
    words = [word.strip("'") for word in words]
    words = [word for word in words
             if word not in STOP_WORDS and len(word) > 1]
    if use_stem:
        stemmer = Stemmer.Stemmer('english')
        words = stemmer.stemWords(words)
    if use_metaphone:
        dmeta = fuzzy.DMetaphone()
        # Collect both metaphone codes for every word, then drop the Nones.
        w = []
        for word in words:
            w.extend(dmeta(word))
        words = filter(lambda a: a is not None, w)
    if not add:
        return words
    # Calculate the TF portion of TF/IDF.
    counts = collections.defaultdict(float)
    for word in words:
        counts[word] += 1
    wordcount = len(words)
    tf = dict((word, count / wordcount)
              for word, count in counts.iteritems())
    return tf
def get_suggestion(word, tree, meta_dict):
    dmeta = fuzzy.DMetaphone()
    words_list = tree.query(word, 1)
    words_list1 = []
    words_list2 = []
    # Strip the edit-distance values from the (distance, word) pairs.
    for i in range(0, len(words_list)):
        words_list1.append(words_list[i][1])
    dmeta_result = dmeta(word)
    if dmeta_result[0] is not None:
        key1 = dmeta_result[0]
        try:
            # Copy so that extending below cannot mutate meta_dict's list.
            words_list2 = list(meta_dict[key1])
        except KeyError:
            pass
    if dmeta_result[1] is not None:
        key2 = dmeta_result[1]
        try:
            words_list2.extend(meta_dict[key2])
        except KeyError:
            pass
    # Find the intersection of the two lists.
    words_list3 = list(set(words_list1) & set(words_list2))
    return [words_list1, words_list2, words_list3]
def compile_people(source, playing, games):
    playing = pd.merge(playing, games[['key', 'league']],
                       left_on='game.key', right_on='key')
    playing['league'] = playing['league'].apply(
        lambda x: x + " League"
        if "League" not in x and "Association" not in x else x)
    playing['year'] = playing['game.date'].str.split("-").str[0]
    playing['B_G'] = 1
    for pos in ['p', 'c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf']:
        playing['F_%s_G' % pos.upper()] = playing['pos'].apply(
            lambda x: (1 if pos in x.split("-") else 0)
            if not pd.isnull(x) else 0)
    playing['F_OF_G'] = playing['F_LF_G'] | playing['F_CF_G'] | \
        playing['F_RF_G']
    playing['P_G'] = playing['F_P_G']
    playing['name.first'] = playing['name.first'].fillna("")
    playing = playing[playing['name.last'] != "TOTALS"]
    grouper = playing.groupby(
        ['year', 'league', 'name.last', 'name.first', 'club.name'])
    df = grouper.sum()
    df = pd.merge(df,
                  grouper[['game.date']].min()
                  .rename(columns={'game.date': 'S_FIRST'}),
                  left_index=True, right_index=True)
    df = pd.merge(df,
                  grouper[['game.date']].max()
                  .rename(columns={'game.date': 'S_LAST'}),
                  left_index=True, right_index=True)
    df.reset_index(inplace=True)
    df['metaphone'] = df['name.last'].apply(
        lambda x: fuzzy.DMetaphone()(x.split("[")[0])[0].ljust(4, 'Z'))
    df['metaseq'] = df.groupby(['year', 'league', 'metaphone']).cumcount() + 1
    df['metacount'] = df.groupby(
        ['year', 'league', 'metaphone'])['metaseq'].transform('count')
    df['person.ref'] = df.apply(
        lambda x: "%s%02d%02d" % (x.metaphone, x.metaseq, x.metacount),
        axis=1)
    df.rename(inplace=True, columns={
        'name.last': 'person.name.last',
        'name.first': 'person.name.given',
        'club.name': 'entry.name',
        'year': 'league.year',
        'league': 'league.name'
    })
    df = df[[
        'league.year', 'league.name', 'person.ref',
        'person.name.last', 'person.name.given', 'entry.name',
        'S_FIRST', 'S_LAST', 'B_G', 'P_G',
        'F_1B_G', 'F_2B_G', 'F_3B_G', 'F_SS_G',
        'F_OF_G', 'F_LF_G', 'F_CF_G', 'F_RF_G',
        'F_C_G', 'F_P_G'
    ]]
    df.to_csv("processed/%s/people.csv" % source, index=False,
              float_format='%d')
def __init__(self, word_set, unigrams, k, costs=None, lamda=1,
             alphabet='abcdefghijklmnopqrstuvwxyz'):
    # Initialize alphabet
    self.alphabet = alphabet
    # Store all known words
    self.dict_words = word_set
    # Build and store valid prefixes
    self.valid_prefixes = set([])
    for word in self.dict_words:
        for i in range(len(word) + 1):
            self.valid_prefixes.add(word[:i])
    # Weighting likelihood & prior
    self.lamda = lamda
    # Store unigram probabilities - use Laplace add-k smoothing for
    # log probabilities.
    self.priors = {}
    self.k = k
    self.N = sum(count for word, count in unigrams) + k * len(unigrams) + k
    for word, count in unigrams:
        self.priors[word] = math.log(float(count + k) / self.N)
    # Edit distance costs
    if costs is not None:
        self.insert_costs = costs['ins_costs']
        self.delete_costs = costs['del_costs']
        self.substitute_costs = costs['sub_costs']
        self.transpose_costs = costs['trans_costs']
    else:
        self.insert_costs = np.ones((128, ))
        self.delete_costs = np.ones((128, ))
        self.transpose_costs = np.ones((128, 128))
        self.substitute_costs = np.ones((128, 128))
    # Build phonetic index - Double Metaphone
    self.dmeta = fuzzy.DMetaphone()
    self.phonetic_buckets = {}
    for word in self.dict_words:
        phonetic_idx = self.dmeta(word)
        if phonetic_idx[0] not in self.phonetic_buckets:
            self.phonetic_buckets[phonetic_idx[0]] = []
        self.phonetic_buckets[phonetic_idx[0]].append(word)
        if phonetic_idx[1] is not None:
            if phonetic_idx[1] not in self.phonetic_buckets:
                self.phonetic_buckets[phonetic_idx[1]] = []
            self.phonetic_buckets[phonetic_idx[1]].append(word)
def get_levenshtein_phonetic_similarity(osm_name, source_name):
    dmeta = fuzzy.DMetaphone()
    try:
        dmeta_osm = dmeta(osm_name)[0]  # .decode("utf-8")
        dmeta_source = dmeta(source_name)[0]  # .decode("utf-8")
        return Levenshtein.ratio(dmeta_osm, dmeta_source)
    except Exception:
        return None
def phonetic_equal(str1, str2):
    """Check whether two strings sound alike by comparing the Double
    Metaphone codes of their underscore-separated substrings."""
    if "fuzzy" not in dir():
        # The fuzzy module is unavailable, so no phonetic match is possible.
        # (The original guard was inverted and bailed out when fuzzy *was*
        # present.)
        return False
    dm = fuzzy.DMetaphone(4)
    if [dm(x) for x in str1.split("_")] == [dm(x) for x in str2.split("_")]:
        return True
    return False
def compare(input_list, keywords_dictionary, word_weights):
    # Load phonetics functions
    dmeta = fuzzy.DMetaphone()
    metaphone = lambda x: dmeta(x)[0]
    soundex = fuzzy.Soundex(4)
    phonetics_methods = [metaphone, soundex]
    # Initiate an empty dictionary for scores
    scores = {}
    # Iterate through methods for solving, then iterate through words in
    # scrubbed user input. For each word, compare phonetics to all keywords
    # and add the score to the scores dictionary. After that, do normal
    # QWERTY and LD analyses.
    for method, keywords in keywords_dictionary.iteritems():
        scores[method] = 0
        # Phonetic scoring methods
        for phonetic in phonetics_methods:
            formatted_array = np.asarray(map(phonetic, keywords))
            for word in input_list:
                formatted_word = phonetic(word)
                dist_array = \
                    normalized_damerau_levenshtein_distance_withNPArray(
                        formatted_word, formatted_array)
                dist = min(dist_array)
                # Handle cases where "not" was found within the input -
                # add to the scores dictionary.
                weight = word_weights.get(word) if word_weights.get(word) else 1
                scores[method] += weight * math.sqrt(dist)
        # For QWERTY and Damerau-Levenshtein distances, calculate the
        # differences.
        for word in input_list:
            # Do QWERTY keyboard analysis
            dist_array = normalized_keyboard_word_distance_withNPArray(
                word, keywords)
            dist = min(dist_array)
            # Handle weighting for position from "not"
            weight = word_weights.get(word) if word_weights.get(word) else 1
            scores[method] += weight * math.sqrt(dist)
            # Do normal LD analysis
            dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                word, np.asarray(keywords))
            dist = min(dist_array)
            weight = word_weights.get(word) if word_weights.get(word) else 1
            scores[method] += weight * math.sqrt(dist)
    return scores
def generateMetaphoneHash(self, dictionary, table=None):
    metaphoneHash = {} if table is None else table
    # Build the encoder once rather than on every iteration.
    dmeta = fuzzy.DMetaphone()
    for name, gender in dictionary.iteritems():
        name = self._sanitizeName(name)
        if len(name) > 1:
            metaphonehash = dmeta(name)
            self._appendToDict(metaphonehash, gender, metaphoneHash)
    return metaphoneHash
def _get_name_sound(self, name):
    """Convert a name to its 'sound'.

    We use the 'fuzzy' module, which offers different algorithms for
    finding the phonetics of a text.
    """
    dmo = fuzzy.DMetaphone()
    name_sound_bytes, _ = dmo(name)
    name_sound = name_sound_bytes.decode()
    return name_sound
def match_double_metaphone(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    class1 = []
    class2 = []
    class3 = []
    hasClass1 = False
    hasClass2 = False
    bestMatch = ""
    dmeta = fuzzy.DMetaphone()
    dm_token = dmeta(token)
    dm_token_pk = dm_token[0]
    dm_token_sk = dm_token[1]
    for match in dictSet:
        dm_match = dmeta(match)
        dm_match_pk = dm_match[0]
        dm_match_sk = dm_match[1]
        # Compare against None, not the string 'None': DMetaphone returns
        # None when no code is available.
        if (dm_token_pk is not None) and (dm_token_pk == dm_match_pk):
            hasClass1 = True
            class1.append(match)
            continue
        if (not hasClass1) and (
                (dm_token_pk is not None and dm_token_pk == dm_match_sk) or
                (dm_token_sk is not None and dm_token_sk == dm_match_pk)):
            hasClass2 = True
            class2.append(match)
            continue
        if (not hasClass2) and (dm_token_sk is not None
                                and dm_token_sk == dm_match_sk):
            class3.append(match)
    if hasClass1:
        candidates = class1
    elif hasClass2:
        candidates = class2
    else:
        candidates = class3
    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]
    return bestMatch, candidates, candidatesG
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        - "double_metaphone"
        - "nysiis"
        - "soundex"

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the
        second is a tuple with the tokens for first names. The tuple
        always contains exactly two elements. Only the first results of
        the Double Metaphone algorithm are included in the tuples.
    """
    if phonetic_algorithm == "soundex":
        error = (
            "The version of the 'fuzzy' package in use has a buggy soundex"
            " implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
            " downgrade the package to 1.1 (compatible with Python 2 only) if"
            " you want to use the soundex phonetic encoding.")
        try:
            if fuzzy.Soundex(4)("fuzzy") != "F200":
                raise ValueError(error)
        except UnicodeDecodeError:
            raise ValueError(error)
    dm = fuzzy.DMetaphone()
    soundex = fuzzy.Soundex(5)
    phonetic_algorithms = {
        "double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
        "nysiis": lambda y: fuzzy.nysiis(y),
        "soundex": lambda y: soundex(y)
    }
    tokens = tokenize_name(name)
    # Apply the selected phonetic encoding to every token.
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))
    return tokens
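# Hypothetical call (tokenize_name is defined elsewhere in this module);
# for "Smith, John" the Double Metaphone encoding would yield something
# like (('SM0',), ('JN',)), since only primary codes are kept.
print(phonetic_tokenize_name("Smith, John"))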
def double_metaphone(name=None):
    """
    FIXME: move the import statement to the head when we up the image
    """
    try:
        import fuzzy
    except ImportError:
        return []
    if not name:
        return []
    DM = fuzzy.DMetaphone()
    DMetaphone = DM(name)
    # Drop trailing empty/None codes.
    while len(DMetaphone) != 0 and not DMetaphone[-1]:
        DMetaphone.pop()
    return DMetaphone
class PersonThing(_PersonThing):
    _fuzzy: Callable[[str], Any] = fuzzy.DMetaphone()  #: our fuzzy-match generator

    def get_virtual_hash(self):
        # Since we're not looking for extreme efficiency, we'll just create
        # a data structure that contains the base information.
        datum = {
            'taxid': self.taxid,
            'fname': self._fuzzy(self.fname),
            'lname': self._fuzzy(self.lname)
        }
        # Convert the dictionary to a string. That's our hash.
        return str(datum)
def doublemetaphone(value, search):
    """Compare two strings with the Double Metaphone phonetic encoding
    algorithm of the Fuzzy library, using both the primary and secondary
    code of each string for matching."""
    dmeta = fuzzy.DMetaphone()
    dmeta_value = dmeta(value.encode("utf-8"))
    dmeta_search = dmeta(search.encode("utf-8"))
    if value == search:
        return True
    if dmeta_value[1] is not None and dmeta_search[1] is not None:
        for v in dmeta_value:
            for s in dmeta_search:
                if v == s:
                    return True
    return dmeta_value == dmeta_search
def name_check(self, name, namelist):
    # Name is in the name dictionary
    if name in namelist:
        return None
    # Name is not in the dictionary: find the most similar names using
    # Double Metaphone plus a similarity ratio.
    else:
        dmeta = fuzzy.DMetaphone()
        result = []
        for n in namelist:
            if set(dmeta(n)) == set(dmeta(name)):
                result.append(n)
        score = {}
        for i in result:
            score[i] = fuzzywuzzy.fuzz.ratio(i, name)
        # Keep the three highest-scoring candidates. (The original sorted
        # by name rather than by score, discarding the best matches.)
        score = dict(sorted(score.items(), key=lambda kv: kv[1],
                            reverse=True)[:3])
        suggestion = ' '.join(score.keys())
        return suggestion
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the
        second is a tuple with the tokens for first names. The tuple
        always contains exactly two elements. Only the first results of
        the Double Metaphone algorithm are included in the tuples.
    """
    if sys.version[0] == '2':
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {"double_metaphone": lambda y: dm(y)[0]}
    tokens = tokenize_name(name)
    # Apply the selected phonetic encoding to every token.
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))
    return tokens
class DoubleMetaphone:
    _dmeta = fuzzy.DMetaphone()

    def __init__(self, dataset_file):
        self.metaphone_dictionary = defaultdict(list)
        self._dataset_file = dataset_file

    def load_metaphone_dictionary(self):
        with open(self._dataset_file, "r") as dataset:
            for word in dataset:
                # Strip the trailing newline from each word.
                word = word.strip()
                dmeta_result = self._dmeta(word)
                if dmeta_result[0]:
                    self.metaphone_dictionary[dmeta_result[0]].append(word)
                if dmeta_result[1]:
                    self.metaphone_dictionary[dmeta_result[1]].append(word)
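# Sketch of typical use, assuming a newline-delimited word list at the
# hypothetical path "words.txt": words sharing a code are grouped under
# either their primary or secondary key (bytes such as b'SM0').
index = DoubleMetaphone("words.txt")
index.load_metaphone_dictionary()
print(index.metaphone_dictionary.get(index._dmeta("smith")[0], []))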
def compare_lists(list1, list2):
    metaphor = fuzzy.DMetaphone()
    count1 = 0
    count2 = 0
    match_list = []
    for item in list1:
        met1, met2 = metaphor(item)
        items = [item, met1, met2]
        for compare_item in list2:
            a_met1, a_met2 = metaphor(compare_item)
            compare_items = [compare_item, a_met1, a_met2]
            ans = compare_items_function(items, compare_items)
            if ans:
                match_list.append((count1, count2))
                break
            count2 += 1
        count2 = 0
        count1 += 1
    return match_list
def get_dmeta(string):
    """Get the Double Metaphone representation of a string.

    Written by Dillon Ranwala
    """
    dmeta = fuzzy.DMetaphone()
    splitstring = string.split()
    stringlist = []
    encoding = 'utf-8'
    for word in splitstring:
        stringlist.append(dmeta(word)[0])
    # Convert None results (no code available) to empty byte strings.
    stringlist = [bytes('', encoding) if word is None else word
                  for word in stringlist]
    # Decode the bytes into a unicode string for each word.
    bytes2str = []
    for byte in stringlist:
        b2str = byte.decode(encoding)
        bytes2str.append(b2str)
    finalstr = ' '.join(bytes2str)
    return finalstr
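# Quick illustration: each whitespace-separated word is replaced by its
# primary Double Metaphone code, joined back together with spaces.
print(get_dmeta("double metaphone"))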
def main(args):
    ensure_exist(args.outdir, is_dir=True)
    json.dump(vars(args), open(os.path.join(args.outdir, 'config.json'), 'w'))
    unigram_model = UnigramModel(args.word_counts_path, args.oov_prob)
    retriever = Retriever(args.doc_file, path=args.retriever_model,
                          overwrite=args.overwrite_retriever_model)
    if args.system.startswith('rule') or args.system == 'keywords' or \
            args.scorer in ('goodman',):
        skipgram = SkipGram.load_model(args.skipgram_model[0],
                                       args.skipgram_model[1],
                                       embedding_size=args.skipgram_embed_size,
                                       cpu=args.cpu)
    else:
        skipgram = None
    if args.scorer == 'random':
        scorer = RandomScorer()
    elif args.scorer == 'surprisal':
        lm = LMScorer.load_model(args.lm_path)
        scorer = SurprisalScorer(lm, unigram_model,
                                 local_window_size=args.local_window_size)
    elif args.scorer == 'goodman':
        scorer = GoodmanScorer(unigram_model, skipgram)
    type_recognizer = TypeRecognizer(threshold=args.type_consistency_threshold)
    if args.system == 'rule':
        generator = RulebasedGenerator(retriever, skipgram, type_recognizer,
                                       scorer,
                                       dist_to_pun=args.distance_to_pun_word)
    elif args.system == 'rule+neural':
        generator = NeuralCombinerGenerator(retriever, skipgram,
                                            type_recognizer, scorer,
                                            args.distance_to_pun_word, args)
    elif args.system == 'retrieve':
        generator = RetrieveGenerator(retriever, scorer)
    elif args.system == 'retrieve+swap':
        generator = RetrieveSwapGenerator(retriever, scorer)
    puns = json.load(open(args.pun_words))
    # Uniq
    d = {}
    for e in puns:
        d[e['pun_word']] = e
    puns = d.values()
    # Sort by quality of the pun words.
    dmeta = fuzzy.DMetaphone()
    homophone = lambda x, y: float(dmeta(x)[0] == dmeta(y)[0])
    length = lambda x, y: float(len(x) > 2 and len(y) > 2)
    freq = lambda x, y: unigram_model.word_counts.get(x, 0) * \
        unigram_model.word_counts.get(y, 0)
    puns = sorted(puns,
                  key=lambda e: (length(e['pun_word'], e['alter_word']),
                                 homophone(e['pun_word'], e['alter_word']),
                                 freq(e['pun_word'], e['alter_word'])),
                  reverse=True)
    num_success = 0
    processed_examples = []
    for example in puns:
        pun_word, alter_word = example['pun_word'], example['alter_word']
        logger.info('-' * 50)
        logger.info('INPUT: alter={} pun={}'.format(alter_word, pun_word))
        logger.info('REFERENCE: {}'.format(' '.join(example['tokens'])))
        logger.info('-' * 50)
        feasible, reason = feasible_pun_words(
            pun_word, alter_word, unigram_model, skipgram=skipgram,
            freq_threshold=args.pun_freq_threshold)
        if not feasible:
            example['fail'] = reason
            continue
        results = generator.generate(alter_word, pun_word,
                                     k=args.num_topic_words,
                                     ncands=args.num_candidates,
                                     ntemps=args.num_templates)
        example['results'] = results
        if not results:
            continue
        results = [r for r in results if r.get('score') is not None]
        results = sorted(results, key=lambda r: r['score'], reverse=True)
        for r in results[:3]:
            logger.info('{:<8.2f}{}'.format(r['score'], ' '.join(r['output'])))
        processed_examples.append(example)
        num_success += 1
        if args.max_num_examples > 0 and num_success >= args.max_num_examples:
            break
    json.dump(processed_examples,
              open(os.path.join(args.outdir, 'results.json'), 'w'))
import fuzzy
import jellyfish

"""
double metaphone:
-----------------
(Primary Key = Primary Key) = Strongest Match
(Secondary Key = Primary Key) = Normal Match
(Primary Key = Secondary Key) = Normal Match
"""

global dm
dm = fuzzy.DMetaphone()

global conf
conf = {"dist": "get_closest_jaro", "phon": "get_close_dmeta"}
# conf["dist"] = "get_closest_jaro_winkler"

global comp_map
comp_map = {}

comp_list = [
    "get_closest_jaro", "get_closest_jaro_winkler", "get_closest_hamming",
    "get_closest_damerau_levenshtein", "get_closest_levenshtein",
    "get_close_dmeta"
]


def get_closest_match(needle, haystack):
    global conf
    global comp_map
    res = None
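# A small sketch of the match strengths described in the comment block
# above, using the module-level dm encoder (the helper name is
# illustrative, not part of the original module):
def dmeta_match_strength(a, b):
    pa, sa = dm(a)
    pb, sb = dm(b)
    if pa is not None and pa == pb:
        return "strongest"  # primary == primary
    if (sa is not None and sa == pb) or (pa is not None and pa == sb):
        return "normal"     # secondary == primary, in either direction
    return "none"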
import fuzzy
import geohash
import re
import six
from postal.expand import expand_address, ADDRESS_NAME, ADDRESS_STREET, \
    ADDRESS_UNIT, ADDRESS_LEVEL, ADDRESS_HOUSE_NUMBER, \
    ADDRESS_POSTAL_CODE, ADDRESS_TOPONYM

from lieu.address import AddressComponents, VenueDetails, Coordinates
from lieu.api import DedupeResponse
from lieu.similarity import ordered_word_count, soft_tfidf_similarity, \
    jaccard_similarity
from lieu.encoding import safe_encode, safe_decode
from lieu.floats import isclose

double_metaphone = fuzzy.DMetaphone()
whitespace_regex = re.compile(r'[\s]+')


class AddressDeduper(object):
    DEFAULT_GEOHASH_PRECISION = 7

    @classmethod
    def component_equals(cls, c1, c2, component, no_whitespace=True):
        if not c1 or not c2:
            return False
        c1 = safe_decode(c1)
        c2 = safe_decode(c2)
        if no_whitespace and whitespace_regex.sub(
                u'', c1.lower()) == whitespace_regex.sub(u'', c2.lower()):
            return True
# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT License, included in this distribution as LICENSE
"""
Functions for processing value labels
"""

import csv
from collections import defaultdict
from pathlib import Path

import fuzzy
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

dmeta = fuzzy.DMetaphone()
tokenizer = RegexpTokenizer(r'\w+')


def dmeta_sub(s1, s2):
    try:
        p1 = sorted(dmeta(str(e))[0] for e in tokenizer.tokenize(str(s1)))
        p2 = sorted(dmeta(str(e))[0] for e in tokenizer.tokenize(str(s2)))
    except TypeError:
        # print("!!! {}, {}".format(s1, s2))
        return 100
        # raise
    if all(w1 in p2 for w1 in p1) or all(w2 in p1 for w2 in p2):
        return 0
def file1():
    correct = 0
    incorrect = 0
    f = open("/home/loksuvidha/Desktop/data.csv", "r")
    data = csv.reader(f)
    total_rows = 0
    for row in data:
        print(row)
        total_rows = total_rows + 1
        proposal_name = remove_prefix(row[0].split(' '))
        rto_name = remove_prefix(row[1].split(' '))
        min_length = min(len(proposal_name), len(rto_name))
        print(proposal_name, rto_name)
        dmetaphone = fuzzy.DMetaphone(4)
        sum_of_counter = 0
        for i in range(len(proposal_name)):
            counter = 0
            j = 0
            while j < len(rto_name):
                word_of_proposal_name = proposal_name[i].upper()
                word_of_rto_name = rto_name[j].upper()
                if (word_of_proposal_name in word_of_rto_name or
                        word_of_rto_name in word_of_proposal_name):
                    counter = counter + 1
                elif dmetaphone(word_of_proposal_name) == \
                        dmetaphone(word_of_rto_name):
                    counter = counter + 1
                else:
                    if Levenshtein.distance(word_of_proposal_name,
                                            word_of_rto_name) <= 2:
                        counter = counter + 1
                j = j + 1
            sum_of_counter = sum_of_counter + counter
        # print("Counter=", sum_of_counter)
        if sum_of_counter >= 2 or sum_of_counter >= min_length:
            # print("Correct name:")
            correct = correct + 1
        else:
            print("Incorrect name:")
            incorrect = incorrect + 1
    print("Total rows=", total_rows)
    print("Correct name=", correct)
    print("Incorrect name=", incorrect)
    f.close()
def compareByDoubleMetaphone(word1, word2):
    dmeta = fuzzy.DMetaphone(4)
    dm1 = dmeta(word1)
    dm2 = dmeta(word2)
    # Match on the primary codes, or on the secondary codes when both exist
    # (the original recomputed each encoding and repeated the comparison).
    return dm1[0] == dm2[0] or (dm1[1] is not None and dm1[1] == dm2[1])
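# Minimal check (assumes fuzzy is importable); "Smith" and "Smyth"
# typically share a primary Double Metaphone code, so this should print True:
print(compareByDoubleMetaphone("Smith", "Smyth"))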
__author__ = '[email protected] (Spencer Kimball)'

import re
import struct

import s2
from viewfinder.backend.base import base64hex
from viewfinder.backend.base.util import ConvertToString
from viewfinder.backend.db import stopwords

try:
    # We have two double metaphone implementations available:
    # the one in "fuzzy" is faster, but doesn't work on pypy.
    import fuzzy
    _D_METAPHONE = fuzzy.DMetaphone()
except ImportError:
    import metaphone
    _D_METAPHONE = metaphone.doublemetaphone


class Indexer(object):
    """An indexer creates arbitrary secondary indexes for a column by
    transforming the column value into a set of index terms. Each index
    term will be stored as a link back to the object containing the
    column.

    The set of index terms are actually the keys of a Python dict, with
    the value being an opaque datum to be retrieved in addition to the
    primary key of the object (more on the utility of this below).

    The simplest example of an Indexer would return the exact value of
    the column. This is equivalent to creating a secondary key on the
    column in a relational database. The object can now be queried by
def get_rule_parameters(vdict, target):
    """Compute the rule-table proportions for a verb (some error is
    unavoidable given the distance algorithm).

    Parameters
    ----------
    vdict : verb dict
    target : target verb

    Return
    ------
    rule_parameters : dict
        Vowels and their average distances.
    """
    verbparser = ParseWord.verbparser
    vowel = 1
    stemlist = vdict.keys()
    dmeta = fuzzy.DMetaphone()
    verb_combi = [(target, v) for v in stemlist if v != target]
    dmeta_combi = [(dmeta(x)[0], dmeta(y)[0]) for x, y in verb_combi]
    dmeta_distance = [distance(*x) for x in dmeta_combi]
    distance_rank = {}  # (target, verb): distance
    for x, y in zip(verb_combi, dmeta_distance):
        distance_rank[x] = y
    distance_distribute = {}  # how many words lie at distance v
    for k, v in distance_rank.items():
        distance_distribute[v] = distance_distribute.get(v, 0) + 1
    # Collect the past-tense vowel-alternation patterns of the words at
    # each distance d.
    past_vowels = {}  # {distance: [vowel, vowel, ...]}
    vowels_distance = {}
    for k, v in distance_distribute.items():
        vowels = []
        for pair, d in distance_rank.items():
            if k == d:
                vowel_string = verbparser(vdict[pair[1]])[vowel]
                vowels.append(vowel_string)
                vowels_distance[vowel_string] = vowels_distance.get(
                    vowel_string, 0) + d
        past_vowels[k] = vowels
    patterns = past_vowels.values()
    patterns = list(chain.from_iterable(patterns))
    vowels_appear = {}
    for c in patterns:
        vowels_appear[c] = vowels_appear.get(c, 0) + 1
    # Compute the average distance for each rule; with this distance
    # algorithm the differences are small.
    rule_parameters = {}
    for vo, dis in vowels_distance.items():
        avg = np.divide(np.double(dis), np.double(vowels_appear[vo]))
        # Round to two decimal places.
        rule_parameters[vo] = np.around(avg, decimals=2)
    return rule_parameters
def test_DMetaphone():
    m = fuzzy.DMetaphone()
    assert m("mayer") == [b'MR', None]