Example #1
import string

import fuzzy


def check_words_equal(word1, word2):
    # remove punctuation from both words
    word1_mod = word1.translate(str.maketrans("", "", string.punctuation))
    word2_mod = word2.translate(str.maketrans("", "", string.punctuation))
    d_meta = fuzzy.DMetaphone()
    # fuzzy phonetic comparison on the cleaned words
    return d_meta(word1_mod) == d_meta(word2_mod)
def dMetaphone(collection):
    """
    Return the metaphone encoding of a word, or of each word in a collection.

    Arguments:
    collection  -- the string, or the list of words, to encode with metaphone.
    """

    if not isinstance(collection, (str, list)):
        raise TypeError("The collection for metaphone is not a string or a list.")

    import fuzzy
    dmetaphone = fuzzy.DMetaphone()

    if isinstance(collection, str):
        return dmetaphone(collection)

    collectionEncoded = []
    for word in collection:
        wordEncoded = dmetaphone(word)
        # fuzzy returns bytes on Python 3; decode any non-None codes
        if wordEncoded[0] is not None:
            wordEncoded[0] = wordEncoded[0].decode('UTF-8')
        if wordEncoded[1] is not None:
            wordEncoded[1] = wordEncoded[1].decode('UTF-8')

        collectionEncoded.append(wordEncoded)

    return collectionEncoded
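Both helpers above rely on the same fuzzy.DMetaphone calling convention, which the short sketch below (not part of the original example) illustrates. The "mayer" output matches the test in Example #30, and the "Smith" codes are the standard Double Metaphone encodings; on Python 3, fuzzy returns the codes as bytes:

import fuzzy

dmeta = fuzzy.DMetaphone()

# Each call returns a [primary, secondary] pair; the secondary entry is
# None when the word has no alternate encoding.
print(dmeta("mayer"))   # [b'MR', None]
print(dmeta("Smith"))   # [b'SM0', b'XMT']

# Decode for string-level work, guarding against None entries.
codes = [c.decode("utf-8") for c in dmeta("Smith") if c is not None]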
Example #3
    def get_index_keys(content, add=True):
        # Very simple word-based parser.  We skip stop words and single
        # character words.
        words = NON_WORDS.sub(' ', content.lower()).split()
        words = [word.strip("'") for word in words]
        words = [word for word in words
            if word not in STOP_WORDS and len(word) > 1]

        if use_stem:
            stemmer = Stemmer.Stemmer('english')
            words = stemmer.stemWords(words)

        if use_metaphone:
            dmeta = fuzzy.DMetaphone()
            # collect the primary and secondary code of every word,
            # dropping the None placeholders
            w = []
            for word in words:
                w.extend(dmeta(word))
            words = [code for code in w if code is not None]

        if not add:
            return words

        # Calculate the TF portion of TF/IDF.
        counts = collections.defaultdict(float)
        for word in words:
            counts[word] += 1
        wordcount = len(words)
        tf = dict((word, count / wordcount)
                    for word, count in counts.items())
        return tf
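The TF block above is plain relative frequency. For an illustrative input (not from the source) of words == ['cat', 'cat', 'dog'], counts becomes {'cat': 2.0, 'dog': 1.0} and tf comes out as {'cat': 2/3, 'dog': 1/3}.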
Example #4
def get_suggestion(word, tree, meta_dict):
    dmeta = fuzzy.DMetaphone()

    words_list = tree.query(word, 1)

    words_list1 = []
    words_list2 = []
    # strip the edit-distance values paired with each word in words_list
    for dist_word in words_list:
        words_list1.append(dist_word[1])

    dmeta_result = dmeta(word)

    if dmeta_result[0] is not None:
        key1 = dmeta_result[0]
        try:
            # copy the bucket so the extend() below cannot mutate meta_dict
            words_list2 = list(meta_dict[key1])
        except KeyError:
            pass

        if dmeta_result[1] is not None:
            key2 = dmeta_result[1]
            try:
                words_list2.extend(meta_dict[key2])
            except KeyError:
                pass

    # Find the intersection of the two lists
    words_list3 = list(set(words_list1) & set(words_list2))

    return [words_list1, words_list2, words_list3]
Example #5
def compile_people(source, playing, games):
    playing = pd.merge(playing,
                       games[['key', 'league']],
                       left_on='game.key',
                       right_on='key')
    playing['league'] = playing['league'].apply(
        lambda x: x + " League"
        if "League" not in x and "Association" not in x else x)
    playing['year'] = playing['game.date'].str.split("-").str[0]
    playing['B_G'] = 1
    for pos in ['p', 'c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf']:
        playing['F_%s_G' % pos.upper()] = playing['pos'].apply(lambda x: (
            1 if pos in x.split("-") else 0) if not pd.isnull(x) else 0)
    playing['F_OF_G'] = playing['F_LF_G'] | playing['F_CF_G'] | \
                        playing['F_RF_G']
    playing['P_G'] = playing['F_P_G']
    playing['name.first'] = playing['name.first'].fillna("")
    playing = playing[playing['name.last'] != "TOTALS"]
    grouper = playing.groupby(
        ['year', 'league', 'name.last', 'name.first', 'club.name'])
    df = grouper.sum()
    df = pd.merge(df,
                  grouper[['game.date']].min().rename(
                      columns={'game.date': 'S_FIRST'}),
                  left_index=True,
                  right_index=True)
    df = pd.merge(df,
                  grouper[['game.date']].max().rename(
                      columns={'game.date': 'S_LAST'}),
                  left_index=True,
                  right_index=True)
    df.reset_index(inplace=True)

    df['metaphone'] = df['name.last'].apply(lambda x: fuzzy.DMetaphone()
                                            (x.split("[")[0])[0].ljust(4, 'Z'))
    df['metaseq'] = df.groupby(['year', 'league', 'metaphone']).cumcount() + 1
    df['metacount'] = df.groupby(['year', 'league',
                                  'metaphone'])['metaseq'].transform('count')
    df['person.ref'] = df.apply(lambda x: "%s%02d%02d" %
                                (x.metaphone, x.metaseq, x.metacount),
                                axis=1)
    df.rename(inplace=True,
              columns={
                  'name.last': 'person.name.last',
                  'name.first': 'person.name.given',
                  'club.name': 'entry.name',
                  'year': 'league.year',
                  'league': 'league.name'
              })
    df = df[[
        'league.year', 'league.name', 'person.ref', 'person.name.last',
        'person.name.given', 'entry.name', 'S_FIRST', 'S_LAST', 'B_G', 'P_G',
        'F_1B_G', 'F_2B_G', 'F_3B_G', 'F_SS_G', 'F_OF_G', 'F_LF_G', 'F_CF_G',
        'F_RF_G', 'F_C_G', 'F_P_G'
    ]]
    df.to_csv("processed/%s/people.csv" % source,
              index=False,
              float_format='%d')
Example #6
    def __init__(self,
                 word_set,
                 unigrams,
                 k,
                 costs=None,
                 lamda=1,
                 alphabet='abcdefghijklmnopqrstuvwxyz'):

        # Initialize alphabet
        self.alphabet = alphabet

        # Store all known words
        self.dict_words = word_set

        # Build and store valid prefixes
        self.valid_prefixes = set([])
        for word in self.dict_words:
            for i in range(len(word) + 1):
                self.valid_prefixes.add(word[:i])

        # Weighting likelihood & prior
        self.lamda = lamda

        # Store unigram probabilities - Use Laplace Add-k Smoothing for log probabilities
        self.priors = {}
        self.k = k
        self.N = sum(
            (count for word, count in unigrams)) + k * len(unigrams) + k
        for word, count in unigrams:
            self.priors[word] = math.log(float(count + k) / self.N)

        # Edit Distance Costs
        if costs is not None:
            self.insert_costs = costs['ins_costs']
            self.delete_costs = costs['del_costs']
            self.substitute_costs = costs['sub_costs']
            self.transpose_costs = costs['trans_costs']
        else:
            self.insert_costs = np.ones((128, ))
            self.delete_costs = np.ones((128, ))
            self.transpose_costs = np.ones((128, 128))
            self.substitute_costs = np.ones((128, 128))

        # Build phonetic index - Double Metaphone
        self.dmeta = fuzzy.DMetaphone()
        self.phonetic_buckets = {}

        for word in self.dict_words:
            phonetic_idx = self.dmeta(word)

            if phonetic_idx[0] not in self.phonetic_buckets:
                self.phonetic_buckets[phonetic_idx[0]] = []
            self.phonetic_buckets[phonetic_idx[0]].append(word)

            if phonetic_idx[1] is not None:
                if phonetic_idx[1] not in self.phonetic_buckets:
                    self.phonetic_buckets[phonetic_idx[1]] = []
                self.phonetic_buckets[phonetic_idx[1]].append(word)
def get_levenshtein_phonetic_similarity(osm_name, source_name):
    dmeta = fuzzy.DMetaphone()
    try:
        dmeta_osm = dmeta(osm_name)[0]  #.decode("utf-8")
        dmeta_source = dmeta(source_name)[0]  #.decode("utf-8")
        return Levenshtein.ratio(dmeta_osm, dmeta_source)

    except Exception:
        return None
Example #8
def phonetic_equal(str1, str2):
    """Check if two strings are phonetically equal when the substrings
    (split by _) are permuted."""
    if "fuzzy" not in globals():
        # the fuzzy module is not available, so no phonetic match is possible
        return False

    dm = fuzzy.DMetaphone(4)

    # compare the multisets of metaphone codes so substring order is ignored
    codes1 = sorted((tuple(dm(x)) for x in str1.split("_")), key=repr)
    codes2 = sorted((tuple(dm(x)) for x in str2.split("_")), key=repr)
    return codes1 == codes2
Example #9
def compare(input_list, keywords_dictionary, word_weights):
    # Load phonetics functions
    dmeta = fuzzy.DMetaphone()
    metaphone = lambda x: dmeta(x)[0]
    soundex = fuzzy.Soundex(4)
    phonetics_methods = [metaphone, soundex]

    # initiate empty dictionary for scores
    scores = {}

    # Iterate through methods for solving, then iterate through words in
    # scrubbed user input. For each word, compare phonetics to all keywords
    # and add score to the scores dictionary. After, do normal QWERTY and LD
    # analyses
    for method, keywords in keywords_dictionary.items():
        scores[method] = 0
        # print(method)
        # Phonetic Scoring methods
        for phonetic in phonetics_methods:
            formatted_array = np.asarray(list(map(phonetic, keywords)))

            for word in input_list:
                formatted_word = phonetic(word)
                dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                    formatted_word, formatted_array)

                dist = min(dist_array)

                # Handle cases where "not" was found within the input - add to
                #    scores dictionary.
                weight = word_weights.get(word) or 1

                scores[method] += weight * math.sqrt(dist)

        # For QWERTY and Damerau-Levenshtein distances, calculate the differences
        for word in input_list:
            # Do QWERTY Keyboard analysis
            dist_array = normalized_keyboard_word_distance_withNPArray(
                word, keywords)
            dist = min(dist_array)

            # handle weighting for position from "not"
            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

            # Do normal LD analysis
            dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                word, np.asarray(keywords))
            dist = min(dist_array)

            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

    return scores
Example #10
    def generateMetaphoneHash(self, dictionary, table=None):
        metaphoneHash = {} if table is None else table
        dmeta = fuzzy.DMetaphone()  # build the encoder once, not per name

        for name, gender in dictionary.items():
            name = self._sanitizeName(name)

            if len(name) > 1:
                metaphoneCodes = dmeta(name)
                self._appendToDict(metaphoneCodes, gender, metaphoneHash)

        return metaphoneHash
Example #11
    def _get_name_sound(self, name):
        """
        Convert a name to its 'sound'.

        We use the 'fuzzy' module, which offers different algorithms to find the phonetics of
        a text.

        """
        dmo = fuzzy.DMetaphone()
        name_sound_bytes, _ = dmo(name)
        name_sound = name_sound_bytes.decode()
        return name_sound
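Note that dmo(name) can return None as the primary code (for instance for an empty string), in which case the .decode() call above raises AttributeError; a guard like the None checks in Examples #1 and #19 may be worth adding.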
Example #12
def match_double_metaphone(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    class1 = []
    class2 = []
    class3 = []
    hasClass1 = False
    hasClass2 = False
    bestMatch = ""

    dmeta = fuzzy.DMetaphone()
    dm_token = dmeta(token)
    dm_token_pk = dm_token[0]
    dm_token_sk = dm_token[1]

    for match in dictSet:
        dm_match = dmeta(match)
        dm_match_pk = dm_match[0]
        dm_match_sk = dm_match[1]

        if (dm_token_pk is not None) and (dm_token_pk == dm_match_pk):
            hasClass1 = True
            class1.append(match)
            continue
        if (not hasClass1) and (
            (dm_token_pk is not None and dm_token_pk == dm_match_sk) or
            (dm_token_sk is not None and dm_token_sk == dm_match_pk)):
            hasClass2 = True
            class2.append(match)
            continue

        if (not hasClass2) and (dm_token_sk is not None
                                and dm_token_sk == dm_match_sk):
            class3.append(match)

    if hasClass1:
        candidates = class1
    elif hasClass2:
        candidates = class2
    else:
        candidates = class3

    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]

    return bestMatch, candidates, candidatesG
Example #13
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.

    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis"
        -  "soundex"

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the second
        is a tuple with the tokens for first names. The tuple always contains
        exactly two elements. Only the first results of the double metaphone
        algorithm are included in tuples.
    """
    if phonetic_algorithm == "soundex":
        error = (
            "The version of the 'fuzzy' package in use has a buggy soundex"
            " implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
            " downgrade the package to 1.1 (compatible with Python 2 only) if"
            " you want to use the soundex phonetic encoding.")
        try:
            if fuzzy.Soundex(4)("fuzzy") != "F200":
                raise ValueError(error)
        except UnicodeDecodeError:
            raise ValueError(error)

    dm = fuzzy.DMetaphone()
    soundex = fuzzy.Soundex(5)
    phonetic_algorithms = {
        "double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
        "nysiis": lambda y: fuzzy.nysiis(y),
        "soundex": lambda y: soundex(y)
    }

    tokens = tokenize_name(name)
    # Use double metaphone
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))

    return tokens
Example #14
def double_metaphone(name=None):
    """
    FIXME: move the import statement to the head when we update the image
    """
    try:
        import fuzzy
    except ImportError:
        return []
    if not name:
        return []
    DM = fuzzy.DMetaphone()
    DMetaphone = DM(name)
    while ((len(DMetaphone) != 0) and (not DMetaphone[-1])):
        DMetaphone.pop()
    return DMetaphone
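A minimal usage sketch; the expected values follow from the pop loop above plus the "mayer" encoding asserted in Example #30:

assert double_metaphone("mayer") == [b'MR']  # trailing None gets popped
assert double_metaphone("") == []            # falsy input short-circuits
assert double_metaphone() == []              # same for the default None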
Example #15
class PersonThing(_PersonThing):

    _fuzzy: Callable[[str], Any] = fuzzy.DMetaphone()  #: our fuzzy-match generator

    def get_virtual_hash(self):
        # Since we're not looking for extreme efficiency, we'll just create a data structure that contains the base
        # information.
        datum = {
            'taxid': self.taxid,
            'fname': self._fuzzy(self.fname),
            'lname': self._fuzzy(self.lname)
        }
        # Convert the dictionary to a string.  That's our hash.
        return str(datum)
Example #16
def doublemetaphone(value, search):
    """Compares two strings by applying the Double Metaphone phonetic
    encoding algorithm of the Fuzzy library, matching on either the
    primary or the secondary code of each string."""
    # exact matches need no phonetic comparison
    if value == search:
        return True

    dmeta = fuzzy.DMetaphone()
    dmeta_value = dmeta(value.encode("utf-8"))
    dmeta_search = dmeta(search.encode("utf-8"))

    if dmeta_value[1] is not None and dmeta_search[1] is not None:
        for v in dmeta_value:
            for s in dmeta_search:
                if v == s:
                    return True
    return dmeta_value == dmeta_search
Example #17
	def name_check(self, name, namelist):
		# Name is in the name dictionary
		if name in namelist:
			return None
		# Name is not in the dictionary: find the most similar names using
		# Double Metaphone and the fuzzywuzzy similarity ratio
		else:
			dmeta = fuzzy.DMetaphone()
			result = []
			for n in namelist:
				if set(dmeta(n)) == set(dmeta(name)):
					result.append(n)
			score = {}
			for i in result:
				score[i] = fuzzywuzzy.fuzz.ratio(i, name)
			# keep the three most similar suggestions
			score = dict(sorted(score.items(), key=lambda kv: kv[1], reverse=True)[:3])
			suggestion = ' '.join(score.keys())
			return suggestion
Example #18
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.

    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the second
        is a tuple with the tokens for first names. The tuple always contains
        exactly two elements. Only the first results of the double metaphone
        algorithm are included in tuples.
    """
    if sys.version[0] == '2':
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {"double_metaphone": lambda y: dm(y)[0]}

    tokens = tokenize_name(name)
    # Use double metaphone
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))

    return tokens
Example #19
class DoubleMetaphone:
    _dmeta = fuzzy.DMetaphone()

    def __init__(self, dataset_file):
        self.metaphone_dictionary = defaultdict(list)
        self._dataset_file = dataset_file

    def load_metaphone_dictionary(self):
        with open(self._dataset_file, "r") as dataset:
            for word in dataset:
                word = word.strip()  # strip newline characters from the end of the word
                dmeta_result = self._dmeta(word)

                if dmeta_result[0]:
                    self.metaphone_dictionary[dmeta_result[0]].append(word)

                if dmeta_result[1]:
                    self.metaphone_dictionary[dmeta_result[1]].append(word)
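A hypothetical usage sketch (the file name is an assumption; the dictionary keys are bytes because fuzzy returns bytes on Python 3):

# dm = DoubleMetaphone("wordlist.txt")   # hypothetical dataset, one word per line
# dm.load_metaphone_dictionary()
# dm.metaphone_dictionary[b'SM0']        # every word that encodes to SM0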
Example #20
def compare_lists(list1, list2):
    metaphor = fuzzy.DMetaphone()
    match_list = []
    # record the (index1, index2) of the first phonetic match for each item
    for count1, item in enumerate(list1):
        met1, met2 = metaphor(item)
        items = [item, met1, met2]
        for count2, compare_item in enumerate(list2):
            a_met1, a_met2 = metaphor(compare_item)
            compare_items = [compare_item, a_met1, a_met2]
            if compare_items_function(items, compare_items):
                match_list.append((count1, count2))
                break
    return match_list
Example #21
def get_dmeta(string):
    """
    Get the double metaphone representation of a string. Written by Dillon Ranwala.
    """
    dmeta = fuzzy.DMetaphone()
    splitstring = string.split()
    stringlist = []
    encoding = 'utf-8'
    for word in splitstring:
        stringlist.append(dmeta(word)[0])

    # Convert None entries (words with no code) to empty byte strings
    stringlist = [(bytes('', encoding)) if word is None else word
                  for word in stringlist]

    #decoding bytes into a unicode string for each word
    bytes2str = []
    for byte in stringlist:
        b2str = byte.decode(encoding)
        bytes2str.append(b2str)

    finalstr = ' '.join(bytes2str)
    return finalstr
Example #22
def main(args):
    ensure_exist(args.outdir, is_dir=True)
    json.dump(vars(args), open(os.path.join(args.outdir, 'config.json'), 'w'))

    unigram_model = UnigramModel(args.word_counts_path, args.oov_prob)
    retriever = Retriever(args.doc_file, path=args.retriever_model, overwrite=args.overwrite_retriever_model)

    if args.system.startswith('rule') or args.system == 'keywords' or args.scorer in ('goodman',):
        skipgram = SkipGram.load_model(args.skipgram_model[0], args.skipgram_model[1], embedding_size=args.skipgram_embed_size, cpu=args.cpu)
    else:
        skipgram = None

    if args.scorer == 'random':
        scorer = RandomScorer()
    elif args.scorer == 'surprisal':
        lm = LMScorer.load_model(args.lm_path)
        scorer = SurprisalScorer(lm, unigram_model, local_window_size=args.local_window_size)
    elif args.scorer == 'goodman':
        scorer = GoodmanScorer(unigram_model, skipgram)

    type_recognizer = TypeRecognizer(threshold=args.type_consistency_threshold)

    if args.system == 'rule':
        generator = RulebasedGenerator(retriever, skipgram, type_recognizer, scorer, dist_to_pun=args.distance_to_pun_word)
    elif args.system == 'rule+neural':
        generator = NeuralCombinerGenerator(retriever, skipgram, type_recognizer, scorer, args.distance_to_pun_word, args)
    elif args.system == 'retrieve':
        generator = RetrieveGenerator(retriever, scorer)
    elif args.system == 'retrieve+swap':
        generator = RetrieveSwapGenerator(retriever, scorer)

    puns = json.load(open(args.pun_words))
    # Uniq
    d = {}
    for e in puns:
        d[e['pun_word']] = e
    puns = d.values()
    # Sorting by quality of pun words
    dmeta = fuzzy.DMetaphone()
    homophone = lambda x, y: float(dmeta(x)[0] == dmeta(y)[0])
    length = lambda x, y: float(len(x) > 2 and len(y) > 2)
    freq = lambda x, y: unigram_model.word_counts.get(x, 0) * unigram_model.word_counts.get(y, 0)
    puns = sorted(puns, key=lambda e: (length(e['pun_word'], e['alter_word']),
                                       homophone(e['pun_word'], e['alter_word']),
                                       freq(e['pun_word'], e['alter_word'])),
                  reverse=True)
    num_success = 0
    processed_examples = []
    for example in puns:
        pun_word, alter_word = example['pun_word'], example['alter_word']
        logger.info('-'*50)
        logger.info('INPUT: alter={} pun={}'.format(alter_word, pun_word))
        logger.info('REFERENCE: {}'.format(' '.join(example['tokens'])))
        logger.info('-'*50)

        feasible, reason = feasible_pun_words(pun_word, alter_word, unigram_model, skipgram=skipgram, freq_threshold=args.pun_freq_threshold)
        if not feasible:
            example['fail'] = reason
            continue

        results = generator.generate(alter_word, pun_word, k=args.num_topic_words, ncands=args.num_candidates, ntemps=args.num_templates)
        example['results'] = results
        if not results:
            continue

        results = [r for r in results if r.get('score') is not None]
        results = sorted(results, key=lambda r: r['score'], reverse=True)
        for r in results[:3]:
            logger.info('{:<8.2f}{}'.format(r['score'], ' '.join(r['output'])))

        processed_examples.append(example)
        num_success += 1
        if args.max_num_examples > 0 and num_success >= args.max_num_examples:
            break

    json.dump(processed_examples, open(os.path.join(args.outdir, 'results.json'), 'w'))
Example #23
import fuzzy
import jellyfish
"""
double metaphone:
-----------------
	(Primary Key = Primary Key)   = Strongest Match
	(Secondary Key = Primary Key) = Normal Match
	(Primary Key = Secondary Key) = Normal Match
"""

global dm
dm = fuzzy.DMetaphone()

global conf
conf = {"dist": "get_closest_jaro", "phon": "get_close_dmeta"}
#conf["dist"] = "get_closest_jaro_winkler"

global comp_map
comp_map = {}

comp_list = [
    "get_closest_jaro", "get_closest_jaro_winkler", "get_closest_hamming",
    "get_closest_damerau_levenshtein", "get_closest_levenshtein",
    "get_close_dmeta"
]


def get_closest_match(needle, haystack):
    global conf
    global comp_map
    res = None
Example #24
import fuzzy
import geohash
import re
import six

from postal.expand import expand_address, ADDRESS_NAME, ADDRESS_STREET, ADDRESS_UNIT, ADDRESS_LEVEL, ADDRESS_HOUSE_NUMBER, ADDRESS_POSTAL_CODE, ADDRESS_TOPONYM

from lieu.address import AddressComponents, VenueDetails, Coordinates
from lieu.api import DedupeResponse
from lieu.similarity import ordered_word_count, soft_tfidf_similarity, jaccard_similarity
from lieu.encoding import safe_encode, safe_decode
from lieu.floats import isclose

double_metaphone = fuzzy.DMetaphone()
whitespace_regex = re.compile(r'\s+')


class AddressDeduper(object):
    DEFAULT_GEOHASH_PRECISION = 7

    @classmethod
    def component_equals(cls, c1, c2, component, no_whitespace=True):
        if not c1 or not c2:
            return False

        c1 = safe_decode(c1)
        c2 = safe_decode(c2)
        if no_whitespace and whitespace_regex.sub(
                u'', c1.lower()) == whitespace_regex.sub(u'', c2.lower()):
            return True
Example #25
# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT License, included in this distribution as LICENSE
"""
Functions for processing value labels
"""
import csv

from pathlib import Path
from tqdm import tqdm

import fuzzy
from collections import defaultdict

dmeta = fuzzy.DMetaphone()

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')


def dmeta_sub(s1, s2):
    try:
        p1 = sorted(dmeta(str(e))[0] for e in tokenizer.tokenize(str(s1)))
        p2 = sorted(dmeta(str(e))[0] for e in tokenizer.tokenize(str(s2)))
    except TypeError:
        # print("!!! {}, {}".format(s1, s2))
        return 100
        # raise

    if all(w1 in p2 for w1 in p1) or all(w2 in p1 for w2 in p2):
        return 0
Example #26
def file1():

    correct = 0
    incorrect = 0
    f = open("/home/loksuvidha/Desktop/data.csv", "r")

    data = csv.reader(f)
    total_rows = 0
    for row in data:
        print(row)

        total_rows = total_rows + 1

        proposal_name = remove_prefix(row[0].split(' '))
        rto_name = remove_prefix(row[1].split(' '))

        min_length = min(len(proposal_name), len(rto_name))

        print(proposal_name, rto_name)

        dmetaphone = fuzzy.DMetaphone(4)
        sum_of_counter = 0

        for i in range(len(proposal_name)):

            counter = 0
            j = 0
            while j < (len(rto_name)):

                word_of_proposal_name = proposal_name[i].upper()
                word_of_rto_name = rto_name[j].upper()

                if (word_of_proposal_name in word_of_rto_name
                        or word_of_rto_name in word_of_proposal_name):
                    counter = counter + 1

                elif (dmetaphone(word_of_proposal_name) == dmetaphone(
                        word_of_rto_name)):
                    counter = counter + 1

                else:
                    if (Levenshtein.distance(word_of_proposal_name,
                                             word_of_rto_name) <= 2):
                        counter = counter + 1
                j = j + 1

            sum_of_counter = sum_of_counter + counter
        # print("Counter=", sum_of_counter)
        if (sum_of_counter >= 2 or sum_of_counter >= min_length):
            #      print("Correct name:")
            correct = correct + 1
        else:

            print("Incorrect name:")
            incorrect = incorrect + 1

    print("Total rows=", total_rows)
    print("Correct name=", correct)
    print("Incorrect name=", incorrect)

    f.close()
def compareByDoubleMetaphone(word1, word2):
    dmeta = fuzzy.DMetaphone(4)
    # match on equal primary codes, or on equal non-None secondary codes
    return dmeta(word1)[0] == dmeta(word2)[0] or (
        dmeta(word1)[1] is not None and dmeta(word1)[1] == dmeta(word2)[1])
Example #28
__author__ = '[email protected] (Spencer Kimball)'

import s2
import re
import struct

from viewfinder.backend.base import base64hex
from viewfinder.backend.base.util import ConvertToString
from viewfinder.backend.db import stopwords

try:
  # We have two double metaphone implementations available:
  # The one in "fuzzy" is faster, but doesn't work on pypy.
  import fuzzy
  _D_METAPHONE = fuzzy.DMetaphone()
except ImportError:
  import metaphone
  _D_METAPHONE = metaphone.doublemetaphone

class Indexer(object):
  """An indexer creates arbitrary secondary indexes for a column by
  transforming the column value into a set of index terms. Each index
  term will be stored as a link back to the object containing the
  column. The set of index terms are actually the keys to a python
  dict, with value being an opaque datum to be retrieved in addition
  to the primary key of the object (more on the utility of this below).

  The simplest example of an Indexer would return the exact value of
  the column. This is equivalent to creating a secondary key on the
  column in a relational database. The object can now be queried by
Example #29
def get_rule_parameters(vdict, target):
    """Compute the proportions of the rule table for a word.
    Some error is probably unavoidable...

    Parameters
    ----------
    vdict : verb dict

    target : target verb

    Return
    ------

    rule_parameters : dict of vowels and their average distances
    """
    verbparser = ParseWord.verbparser

    vowel = 1
    stemlist = vdict.keys()
    dmeta = fuzzy.DMetaphone()
    verb_combi = [(target, v) for v in stemlist if v != target]
    dmeta_combi = [(dmeta(x)[0], dmeta(y)[0]) for x, y in verb_combi]
    dmeta_distance = [distance(*x) for x in dmeta_combi]

    distance_rank = {}  # (target, verb): distance
    for x, y in zip(verb_combi, dmeta_distance):
        distance_rank[x] = y

    distance_distribute = {}
    # how many words lie at distance v
    for k, v in distance_rank.items():
        distance_distribute[v] = distance_distribute.get(v, 0) + 1

    # find the past-tense vowel-alternation patterns of words at distance d
    past_vowels = {}  # {distance: [vowel, vowel, ...]}
    vowels_distance = {}
    for k, v in distance_distribute.items():
        vowels = []
        for pair, d in distance_rank.items():
            if k == d:
                vowel_string = (verbparser(vdict[pair[1]])[vowel])
                vowels.append(vowel_string)
                vowels_distance[vowel_string] = vowels_distance.get(
                    vowel_string, 0) + d

        past_vowels[k] = vowels

    patterns = past_vowels.values()
    patterns = list(chain.from_iterable(patterns))
    vowels_appear = {}
    for c in patterns:
        vowels_appear[c] = vowels_appear.get(c, 0) + 1

    # compute the average distance of each rule; given the distance
    # algorithm, the differences are small
    rule_parameters = {}
    for vo, dis in vowels_distance.items():
        avg = np.divide(np.double(dis), np.double(vowels_appear[vo]))
        # round to two decimal places
        rule_parameters[vo] = np.around(avg, decimals=2)

    return rule_parameters

    """
Example #30
def test_DMetaphone():
    m = fuzzy.DMetaphone()
    assert m("mayer") == [b'MR', None]