コード例 #1
0
 def extract_keyword_from_text(self, text):
     '''
     Tokenize webpage text and extract the keywords found in ``overall_dict``.

     Input:
         text (str): text to extract keywords from
     Output:
         keywords (list): entries of ``overall_dict`` that match a token of
             ``text`` either case-insensitively or by Double Metaphone code.
             A skill is appended once for every distinct token matching it.
     '''
     # Keep only ASCII letters plus '+' and '3'; anything else separates tokens.
     # After this substitution no further punctuation stripping is needed (the
     # previous per-punctuation loop discarded the result of str.replace and
     # merely duplicated every token).
     text = re.sub("[^a-zA-Z+3]", " ", text)
     stops = set(stopwords.words("english"))
     # De-duplicate so each distinct token is compared against the dictionary once.
     tokens = list({w for w in text.lower().split() if w not in stops})

     # Pre-compute the comparison keys of every dictionary entry once instead
     # of once per (token, entry) pair.
     skills = [(skill, skill.lower(), metaphone.doublemetaphone(skill))
               for skill in overall_dict]

     keywords = []
     for word in tokens:
         word_lower = word.lower()
         word_code = metaphone.doublemetaphone(word)
         for skill, skill_lower, skill_code in skills:
             if word_lower == skill_lower or word_code == skill_code:
                 keywords.append(skill)
     return keywords
コード例 #2
0
ファイル: animeDB.py プロジェクト: JWebCoder/rasponline
 def getEpisodesMetadata(self, itemCount, offset, quant, search):
     """Collect metadata for up to ``quant`` episodes starting at ``offset``.

     For every episode number in the requested window, the search query plus
     the zero-padded episode number is looked up through
     ``anime.searchAnimesMetadata`` and the first result whose title is
     phonetically equal to the query (primary Double Metaphone code) is kept.

     Args:
         itemCount: total number of episodes; also fixes the padding width.
         offset: zero-based index of the first episode to fetch.
         quant: maximum number of episodes to fetch.
         search: search string; hyphens are treated as spaces.

     Returns:
         list of dicts with the keys 'title', 'image' and 'link'.
     """
     searchQuery = search.replace("-", " ")
     meta = doublemetaphone(searchQuery)
     quantDigits = len(str(itemCount))
     # The last symbol of the query's code is stripped from both sides before
     # comparing; an empty code yields an empty replacer, which is a no-op.
     replacer = meta[0][-1:]
     metaOne = meta[0].replace(replacer, "")
     listLinks = []
     for i in range(quant):
         target = i + offset
         if target >= itemCount:
             # Targets only grow, so nothing later in the window can qualify.
             break
         # Pad to the width of itemCount. The former if/elif chain stopped
         # padding for counts >= 100, which broke 4+-digit episode totals.
         count = str(target + 1).zfill(quantDigits)
         results = anime.searchAnimesMetadata(searchQuery + ' ' + count, quant=3)
         for ep in results:
             if ep != '':
                 returned = doublemetaphone(ep.title.replace("-", ""))
                 returnedOne = returned[0].replace(replacer, "")
                 if returnedOne == metaOne:
                     listLinks.append({'title': ep.title, 'image': ep.image, 'link': ep.link})
                     break
     return listLinks
コード例 #3
0
 def getEpisodesMetadata(self, itemCount, offset, quant, search):
     """Collect metadata for up to ``quant`` episodes starting at ``offset``.

     Each episode number in the window is zero-padded to the width of
     ``itemCount``, appended to the search query and looked up through
     ``anime.searchAnimesMetadata``. The first result whose title has the same
     primary Double Metaphone code as the query is recorded.

     Args:
         itemCount: total number of episodes; also fixes the padding width.
         offset: zero-based index of the first episode to fetch.
         quant: maximum number of episodes to fetch.
         search: search string; hyphens are treated as spaces.

     Returns:
         list of dicts with the keys 'title', 'image' and 'link'.
     """
     searchQuery = search.replace("-", " ")
     meta = doublemetaphone(searchQuery)
     quantDigits = len(str(itemCount))
     # Last symbol of the query's code; removed from both codes before the
     # comparison. An empty code gives an empty replacer, which is a no-op.
     replacer = meta[0][-1:]
     metaOne = meta[0].replace(replacer, "")
     listLinks = []
     for i in range(quant):
         target = i + offset
         if target >= itemCount:
             # Targets only grow, so the rest of the window is out of range.
             break
         # zfill pads uniformly; the earlier if/elif chain left counts >= 100
         # unpadded when itemCount had four or more digits.
         count = str(target + 1).zfill(quantDigits)
         results = anime.searchAnimesMetadata(searchQuery + ' ' + count,
                                              quant=3)
         for ep in results:
             if ep != '':
                 returned = doublemetaphone(ep.title.replace("-", ""))
                 returnedOne = returned[0].replace(replacer, "")
                 if returnedOne == metaOne:
                     listLinks.append({
                         'title': ep.title,
                         'image': ep.image,
                         'link': ep.link
                     })
                     break
     return listLinks
コード例 #4
0
def adapted_doublemetaphone(data, language="en"):
    '''
    Double Metaphone wrapper with extra handling for Spanish input.

    ``data`` may be a single string or an iterable of strings. For a string
    the first computed code pair is returned; otherwise the list of all
    computed code pairs is returned. With ``language="es"`` every item
    contributes two entries: one for the Spanish-normalised spelling and one
    for the (possibly h-stripped) item itself.
    '''
    single_input = isinstance(data, str)
    items = [data] if single_input else data

    codes = []
    for item in items:
        if language == "es":
            # Items not starting with "ch"/"Ch" are lowercased, "ph" becomes
            # "f" and every remaining "h" is dropped.
            if re.match(r"[Cc]h", item) is None:
                item = re.sub(r"h", "", item.lower().replace("ph", "f"))
            # In Spanish b and v are pronounced alike; normalise them together
            # with soft g and ph before encoding.
            spanish = item.lower()
            for old, new in (("v", "b"), ("gi", "ji"), ("ge", "je"),
                             ("ph", "f")):
                spanish = spanish.replace(old, new)
            codes.append(doublemetaphone(spanish))
        codes.append(doublemetaphone(item))

    solution = codes[0] if single_input else codes
    # A specific case was seen where a complete second pair of rows came back
    # empty; drop that trailing pair.
    if len(solution) == 4 and solution[-2:] == [('', ''), ('', '')]:
        solution = solution[:2]
    return solution
コード例 #5
0
def check_curse(msg):
    """Return True when ``msg`` phonetically contains one of the trigger words.

    The primary Double Metaphone code of ``msg`` is searched for the primary
    code of each trigger. On the first hit ``boto['curse_count']`` is
    incremented and True is returned; otherwise False.
    """
    curse_triggers = ['bitch', 'nigger', 'f**k', 'shit', 'c**k']
    message_code = doublemetaphone(msg)[0]
    for trigger in curse_triggers:
        if doublemetaphone(trigger)[0] in message_code:
            boto['curse_count'] += 1
            return True
    return False
コード例 #6
0
    def __init__(self, ID, name, altNames, latitude, longitutde, country):
        """Store a city's identity, names, position and phonetic codes.

        ``name`` and every non-empty entry of ``altNames`` are passed through
        ``self.preprocess``; the untouched ``name`` is kept in ``origName``.
        ``phonetics`` maps the original name and every alternate name (as
        given, without preprocessing) to its Double Metaphone code pair.
        """
        # Unique ID for this city. Must be hashable.
        self.ID = ID
        # Preprocessed primary name (presumably upper-cased by preprocess).
        self.name = self.preprocess(name)
        # Preprocessed alternate names; empty entries are skipped.
        self.altNames = []
        self.altNames.extend(
            self.preprocess(alias) for alias in altNames if len(alias) > 0)

        # Name exactly as supplied, so the real city name can be returned
        # even if preprocessing strips whitespace, punctuation, etc.
        self.origName = name

        self.latitude = latitude
        self.longitude = longitutde
        # ISO-3166 2-letter country code
        self.country = country

        # Phonetic (double metaphone) representation of the city's name(s):
        # key is the name without preprocessing, value is its code pair.
        self.phonetics = {self.origName: doublemetaphone(self.origName)}
        for alias in altNames:
            self.phonetics[alias] = doublemetaphone(alias)
コード例 #7
0
def _addplayer(opteid, optrid, optplayer):
    """<eid> <rid> <player name>
    Adds a new player into the database.
    Needs a unique EID, RID, and playername (sanitized and parsed). DM will be calculated upon insertion.
    Ex: 2330 <RID> tom brady

    Returns True when the row was inserted, None when sqlite3 reported an
    error (the error is printed). The sanitized name must contain at least a
    first and a last name; only the first two words are used for those.
    """

    # Row layout example: 2330|1163|tom brady|tom|brady|TM||PRT|
    optplayer = _sanitizeName(optplayer)  # sanitize.
    # Split into first, last (the name needs to be parsed before).
    namesplit = optplayer.split()
    fndm = doublemetaphone(namesplit[0])  # dm first.
    lndm = doublemetaphone(namesplit[1])  # dm last.
    # connect to the db and finally add.
    with sqlite3.connect(DB) as db:
        try:
            cursor = db.cursor()
            cursor.execute(
                "INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (opteid, optrid, optplayer, namesplit[0], namesplit[1],
                 fndm[0], fndm[1], lndm[0], lndm[1]))
            db.commit()
            return True
        # "except X as e" is valid on Python 2.6+ and Python 3 alike; the old
        # "except X, e" form is a syntax error on Python 3.
        except sqlite3.Error as e:
            print("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e))
            return None
コード例 #8
0
def _rehashdm(eid):
    """<eid>
    Recalculate the doublemetaphone for a player (eid)
    Ex: 2330

    Returns True when the player's row was updated, None when the player
    does not exist or sqlite3 reported an error (a message is printed).
    """

    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?",
                       (eid, ))
        row = cursor.fetchone()
    # fetchone() yields None for an unknown EID; indexing it would raise.
    if not row:
        print("ERROR: _rehashdm: I did not find any player with EID '{0}'"
              .format(eid))
        return None
    # calculate the dm on first, last; each result is a 2-tuple.
    fndm = doublemetaphone(row[0])
    lndm = doublemetaphone(row[1])
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        try:
            cursor.execute(
                "UPDATE players SET fndm1=?, fndm2=?, lndm1=?, lndm2=? WHERE eid=?",
                (
                    fndm[0],
                    fndm[1],
                    lndm[0],
                    lndm[1],
                    eid,
                ))
            db.commit()
            return True
        # "as" form works on Python 2.6+ and 3; "except X, e" is Python 2 only.
        except sqlite3.Error as e:
            print(
                "ERROR: _rehashdm: I cannot update EID {0}'s doublemetaphone: '{1}'"
                .format(eid, e))
            return None
コード例 #9
0
def adapted_doublemetaphone(data, language="en"):
    '''
    Double Metaphone wrapper with extra handling for Spanish input.

    Accepts a single string or an iterable of strings. A string yields the
    first computed code pair; anything else yields the list of all computed
    code pairs. With ``language="es"`` each item adds two entries: the code
    of its Spanish-normalised spelling followed by the code of the (possibly
    h-stripped) item.
    '''
    is_string = isinstance(data, str)
    words = [data] if is_string else data

    encoded = []
    for word in words:
        if language == "es":
            # Words not starting with "ch"/"Ch" are lowercased, "ph" becomes
            # "f" and every remaining "h" is removed.
            if re.match(r"[Cc]h", word) is None:
                word = re.sub(r"h", "", word.lower().replace("ph", "f"))
            # Spanish b and v sound alike; normalise them together with
            # soft g and ph before encoding.
            normalised = word.lower()
            for old, new in (("v", "b"), ("gi", "ji"), ("ge", "je"),
                             ("ph", "f")):
                normalised = normalised.replace(old, new)
            encoded.append(doublemetaphone(normalised))
        encoded.append(doublemetaphone(word))

    return encoded[0] if is_string else encoded
コード例 #10
0
def check_is_metaphone(str_1: str, str_2: str) -> int:
    """Score two texts as homophones using Double Metaphone.

    Args:
        str_1 (str): First text to compare.
        str_2 (str): Second text to compare.

    Returns:
        int: 100 when any non-empty code of ``str_1`` is also a code of
        ``str_2``; otherwise 0.

    Notes:
        Spacing differences are not detected by the algorithm, e.g.
        "California" and "Cali fornia" score 100.
    """
    codes_1 = doublemetaphone(str_1)
    codes_2 = doublemetaphone(str_2)

    shared = any(code in codes_2 and code != '' for code in codes_1)
    return 100 if shared else 0
コード例 #11
0
 def __create_phonetics__(self, word, misspelling_list):
     """Keep the misspellings that sound like ``word``.

     Returns a 2-tuple: the entries of ``misspelling_list`` whose non-empty
     primary Double Metaphone code equals that of ``word``, and the primary
     code of the last entry examined ('' when the list is empty).
     """
     word_phon = doublemetaphone(word)[0]
     new_error_list = []
     # Bound up front so an empty misspelling_list no longer raises
     # UnboundLocalError at the return statement.
     error_phon = ''
     for error in misspelling_list:
         error_phon = doublemetaphone(error)[0]
         if error_phon and error_phon == word_phon:
             new_error_list.append(error)
     # NOTE(review): the second value is the code of the LAST misspelling,
     # not of ``word`` - confirm that callers really expect this.
     return new_error_list, error_phon
コード例 #12
0
def homo_check(pair):
    """Classify the relation between the two words stored in ``pair[1]``.

    Checks run in order and the first hit wins: identical Double Metaphone
    codes give "homophone", ``stemlemma_check`` gives "inflection", and
    ``simi_sound`` gives "number" when either word contains a digit or
    "quasi-homophone" otherwise. With no hit the string "None" is returned.
    """
    word1 = pair[1][0]
    word2 = pair[1][1]

    if doublemetaphone(word1) == doublemetaphone(word2):
        return "homophone"
    if stemlemma_check(word1, word2):
        return "inflection"
    if not simi_sound(word1, word2):
        return "None"

    contains_digit = (any(char.isdigit() for char in word1)
                      or any(char.isdigit() for char in word2))
    return "number" if contains_digit else "quasi-homophone"
コード例 #13
0
ファイル: utils.py プロジェクト: vinitra/entity-matcher
def is_mispelling(token_left, token_right):
    """Tell whether two tokens are likely spellings of the same word.

    True when the tokens share a non-empty Double Metaphone code, or when
    ``ratio(token_left, token_right)`` is at least 90; False otherwise.
    """
    left_codes = set(doublemetaphone(token_left))
    right_codes = set(doublemetaphone(token_right))

    # Empty codes mean "no encoding" and must not count as common ground.
    shared_codes = (left_codes & right_codes) - {''}
    if shared_codes:
        return True

    if ratio(token_left, token_right) >= 90:
        return True
    return False
コード例 #14
0
def compare_rhyme(word1, word2):
    """Return True when two words have matching Double Metaphone endings.

    Every non-empty code of ``word1`` is compared with every non-empty code
    of ``word2``. Two codes match when they are equal or when they agree
    from the midpoint of the first code onwards.
    """
    # Most words have an empty secondary code. Comparing two empty codes used
    # to report a rhyme for nearly any pair of words, so they are dropped.
    codes1 = [code for code in doublemetaphone(word1) if code]
    codes2 = [code for code in doublemetaphone(word2) if code]

    for i in codes1:
        tail_start = len(i) // 2
        for j in codes2:
            if i == j or i[tail_start:] == j[tail_start:]:
                return True

    return False
コード例 #15
0
 def __hashing_names__(self):
     """Replace the name columns of ``self.df`` with phonetic codes in place.

     For each row, 'First Name' and then 'Last Name' is overwritten with the
     primary Double Metaphone code of its value, or with '' when the value
     is missing according to ``pd.isna``.
     """
     for idx, record in self.df.iterrows():
         for column in ('First Name', 'Last Name'):
             value = record[column]
             if pd.isna(value):
                 self.df.at[idx, column] = ''
             else:
                 self.df.at[idx, column] = doublemetaphone(value)[0]
コード例 #16
0
def phonetic_score(query,cand):
    """Score how closely ``cand`` sounds like ``query``.

    Returns:
        1000000 when the primary Double Metaphone codes agree,
        1000 when a primary code agrees with the other word's secondary code,
        10 when the secondary codes agree,
        1 otherwise.

    Empty codes never count as agreement. Previously two words that merely
    both lacked a secondary code scored 10.
    """
    query_primary, query_secondary = doublemetaphone(query)[:2]
    cand_primary, cand_secondary = doublemetaphone(cand)[:2]

    if query_primary and query_primary == cand_primary:
        return 1000000
    if ((query_secondary and query_secondary == cand_primary)
            or (query_primary and query_primary == cand_secondary)):
        return 1000
    if query_secondary and query_secondary == cand_secondary:
        return 10
    return 1
コード例 #17
0
def dlbMetaphone():
    """Print the Double Metaphone codes of a few sample strings.

    Each sample has "0" appended before it is encoded; the pair and then
    its two members are printed.
    """
    print('Running Double Metaphone:...')

    samples = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp', 'pmp'
    ]

    # The counter is never advanced, so every sample gets the suffix "0".
    suffix = str(0)
    for sample in samples:
        codes = doublemetaphone(sample + suffix)
        print(sample, '->', codes)
        print('Double Metaphone [0]: ', codes[0])
        print('Double Metaphone [1]: ', codes[1])
コード例 #18
0
 def __hashing_names__(self):
     """Overwrite the name columns of ``self.df`` with metaphone codes.

     Row by row, 'First Name' and then 'Last Name' becomes the primary
     Double Metaphone code of its value; values that ``pd.isna`` reports as
     missing become ''. Soundex (jellyfish) would be an alternative hash.
     """
     name_columns = ('First Name', 'Last Name')
     for row_label, row in self.df.iterrows():
         for column in name_columns:
             missing = pd.isna(row[column])
             self.df.at[row_label, column] = (
                 '' if missing else doublemetaphone(row[column])[0])
コード例 #19
0
def get_double_metaphone(df, var):
    """Return the Double Metaphone codes of the words in ``df[var]``.

    ``df[var]`` is split on single spaces. For every piece the primary code
    and, when non-empty, the secondary code are collected, and the codes are
    joined with ', '. Returns '' when encoding any piece fails.
    """
    name_entity = [x.strip() for x in df[var].split(' ')]
    placeholder = []
    # Exception instead of a bare except, so KeyboardInterrupt and SystemExit
    # are no longer swallowed.
    try:
        for token in name_entity:
            # Encode once per token instead of up to three times.
            codes = doublemetaphone(token)
            placeholder.append(codes[0])
            if len(codes[1]) > 0:
                placeholder.append(codes[1])
        return ', '.join(map(str, placeholder))
    except Exception:
        return ''
コード例 #20
0
ファイル: main.py プロジェクト: rechner/hamfursbot
def process_definition(message, term):
    """Look up ``term`` and reply to ``message`` with its definition.

    The lower-cased term is searched in the ``definitions`` collection by
    exact "index", then by "keywords", then by the Double Metaphone code
    stored under "metaphone". The first hit is formatted and sent as
    Markdown; when nothing matches, a "not found" hint is sent instead.
    """
    term_db = mongo_client.hamfurs.definitions
    term = term.lower()

    definition = term_db.find_one({"index": term})
    if definition is None:
        # Search by any keyword value
        definition = term_db.find_one({"keywords": term})

    # Search by metaphone
    if definition is None:
        definition = term_db.find_one({"metaphone": doublemetaphone(term)})

    if definition is None:
        # "\\_" yields the same backslash-underscore text as before, without
        # relying on the invalid escape sequence "\_" (a SyntaxWarning on
        # recent Python versions).
        send_editable_message(
            message,
            "No definition for the given term found.\n(use /add\\_definition to contribute one)",
        )
        return

    txt = "*{term}*: {definition}\n(Contributed by {contributor} _{last_edit}_)".format(
        **definition
    )
    send_editable_message(
        message, txt, parse_mode="Markdown", disable_web_page_preview=True
    )
    return
コード例 #21
0
ファイル: search.py プロジェクト: indirectlylit/kolibri
def fuzz(text):
    """
    Turn ``text`` into normalized, misspelling-tolerant tokens.

    Each whitespace-separated word is stemmed with ``stemmer`` and encoded
    with Double Metaphone. The non-empty codes of all words are returned as
    one flat list, in word order.
    """
    code_pairs = [doublemetaphone(stemmer.stem(word)) for word in text.split()]
    tokens = []
    for pair in code_pairs:
        tokens.extend(code for code in pair if code)
    return tokens
コード例 #22
0
    def __init__(self):
        """Load the word-count dictionary and build the class-level indexes.

        Fills ``SpellChecker.dictCountMap`` from 'data/count_1w100k.txt',
        adds every count to ``SpellChecker.totalCount``, and for each word
        registers it under each of its grams in ``invertTriMap`` and under
        its primary Double Metaphone code in ``invertMetaMap``.
        """
        SpellChecker.dictCountMap = self.readDitionary('data/count_1w100k.txt')
        word_counts = SpellChecker.dictCountMap

        for entry in word_counts:
            SpellChecker.totalCount += word_counts[entry]

        for word in word_counts:
            # Inverted index: gram -> words containing that gram.
            for gram in self.getGrams(word, SpellChecker.invertMapGram):
                SpellChecker.invertTriMap.setdefault(gram, []).append(word)

            # Soundex and plain-metaphone indexes (jellyfish) were tried
            # earlier; only the double-metaphone index is built now.
            meta_key = doublemetaphone(word)[0]
            SpellChecker.invertMetaMap.setdefault(meta_key, []).append(word)
コード例 #23
0
ファイル: playerdb.py プロジェクト: jokker23/Supybot-NFL
def _rehashdm(eid):
    """Compute and print the Double Metaphone codes of a player's names.

    Looks up ``firstname`` and ``lastname`` of the player with the given
    EID. Returns a ``(firstname_codes, lastname_codes)`` tuple, or None when
    no such player exists. A message is printed in both cases.
    """

    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?", (eid,))
        row = cursor.fetchone()

    # print(...) with one argument behaves the same on Python 2 and 3.
    if not row:
        print("I did not find any player in the db with EID '{0}'".format(eid))
        return None

    firstname = doublemetaphone(row[0])
    lastname = doublemetaphone(row[1])
    print("DM :: FIRSTNAME {0} LASTNAME {1}".format(firstname, lastname))
    # The original returned the undefined name ``dm`` (a NameError); return
    # the codes that were just computed.
    # NOTE(review): confirm callers expect this pair.
    return (firstname, lastname)
コード例 #24
0
ファイル: predicates.py プロジェクト: shahin/dedupe
def metaphoneToken(field):
    """Return the set of non-empty Double Metaphone codes of the distinct
    whitespace-separated tokens in ``field``."""
    code_pairs = [doublemetaphone(token) for token in set(field.split())]
    codes = set()
    for pair in code_pairs:
        for code in pair:
            if code:
                codes.add(code)
    return codes
コード例 #25
0
ファイル: WikiActions.py プロジェクト: khouloudAbid25/hermod
async def search_word(word):
    """Find the stored word that sounds like ``word`` and is closest to it.

    The concatenated primary and secondary Double Metaphone codes of
    ``word`` are matched against the ``_s_word`` field of the collection
    returned by ``mongo_connect_words()``. Candidates are ranked by
    ``lev.jaro_winkler`` similarity to ``word``.

    Returns:
        The best-matching document, or None when nothing matches or the
        lookup fails (the failure is logged at debug level).
    """
    logger = logging.getLogger(__name__)
    metaname = doublemetaphone(word)
    queryname = metaname[0] + metaname[1]
    logger.debug('SEARCH WORD')
    logger.debug([word, queryname])
    try:
        collection = mongo_connect_words()
        query = {"_s_word": {"$eq": queryname}}
        logger.debug(query)
        distances = []
        async for document in collection.find(query):
            distance = lev.jaro_winkler(word, document.get('word'))
            distances.append({
                "word": document.get('word'),
                "distance": distance,
                "data": document
            })
        if len(distances) > 0:
            # Highest similarity first.
            distances.sort(key=lambda x: x.get('distance'), reverse=True)
            logger.debug('SEARCH DIST LIST')
            logger.debug(distances)
            return distances[0].get('data')
        else:
            return None

    # Exception rather than a bare except: task cancellation
    # (asyncio.CancelledError) and KeyboardInterrupt must propagate instead
    # of being logged and dropped.
    except Exception:
        logger.debug('SEARCH WORD ERR')
        e = sys.exc_info()
        logger.debug(e)
        return None
コード例 #26
0
def build_similar_phonetic_words_db(lang, wordslist_path):
    """Store every word of a word-list file together with its phonetic codes.

    Each line of ``wordslist_path`` is stripped and lower-cased. For every
    word with at least one non-empty Double Metaphone code, a
    ``PhoneticWord`` is fetched or created per non-empty code and a
    ``LangWord`` linking ``lang``, the word and those rows is fetched or
    created. The session is committed once at the end.
    """
    with open(wordslist_path, 'rb') as wordslist_file:
        raw_lines = wordslist_file.read().splitlines()
    wordslist = [line.strip().lower() for line in raw_lines]

    for word in tqdm(wordslist):
        primary_code, secondary_code = doublemetaphone(word)

        primary_row = None
        if primary_code:
            primary_row = get_or_create(db.session,
                                        PhoneticWord,
                                        phonetic=primary_code)

        secondary_row = None
        if secondary_code:
            secondary_row = get_or_create(db.session,
                                          PhoneticWord,
                                          phonetic=secondary_code)

        # Words without any phonetic code are not stored at all.
        if primary_row is None and secondary_row is None:
            continue

        get_or_create(db.session,
                      LangWord,
                      lang=lang,
                      word=word,
                      phonetic_word_primary=primary_row,
                      phonetic_word_secondary=secondary_row)

    db.session.commit()
コード例 #27
0
    def __init__(self,
                 actorType,
                 name,
                 locationID=None,
                 articleIDs=None,
                 connections=None,
                 id=None,
                 db=None,
                 **kwds):
        '''
        Initializes an Actor object.

        ``_a_name`` holds the concatenated primary and secondary Double
        Metaphone codes of ``name``. ``articleIDs`` defaults to a new empty
        list and ``connections`` (dictionary to ActorConnections) to a new
        empty dict. When ``db`` is given the actor is bound to it: without
        ``id`` it is stored through ``storeDB``, with ``id`` the identifier
        is recorded. Extra keyword arguments are accepted and ignored.
        '''
        self.actorType = actorType
        self.name = name
        metaphone_name = doublemetaphone(name)
        self._a_name = metaphone_name[0] + metaphone_name[1]
        self.locationID = locationID
        # Fresh containers per instance: the former [] / {} defaults were
        # created once and shared by every Actor built without them.
        self.articleIDs = [] if articleIDs is None else articleIDs

        # Dictionary to ActorConnections
        self.connections = {} if connections is None else connections

        if db:
            self._db = db
            self._collection = db[Actor._collectionKey]
            if not id:
                self.storeDB(db)
            else:
                self._mongoID = id
                self._id = str(id)
コード例 #28
0
 def get_dmetaphone_tokens(token):
     """Return the non-empty Double Metaphone codes of ``token``.

     Tokens of two characters or fewer, or containing a digit, produce an
     empty list.
     """
     if len(token) <= 2 or any(ch.isdigit() for ch in token):
         return []
     return [code for code in doublemetaphone(token) if code != ""]
コード例 #29
0
def result():
    """Render the similarity of the two submitted texts.

    On POST, the form fields 'text1' and 'text2' are lower-cased and
    compared twice with ``fuzz.ratio``: once on their primary Double
    Metaphone codes and once on the raw text. The phonetic ratio is shown
    when it exceeds 90, else the textual ratio when it exceeds 85, else 0.
    """
    # NOTE(review): requests other than POST fall through and return None -
    # confirm the route only accepts POST.
    if request.method == 'POST':
        x_1 = request.form['text1'].lower()
        x_2 = request.form['text2'].lower()
        r_1 = fuzz.ratio(doublemetaphone(x_1)[0], doublemetaphone(x_2)[0])
        r_2 = fuzz.ratio(x_1, x_2)
        if r_1 > 90:
            return render_template("result.html", value=r_1)
        if r_2 > 85:
            return render_template("result.html", value=r_2)
        # The former third branch compared the codes with "is" and passed an
        # empty "value=" (a syntax error). Equal non-empty codes are expected
        # to be caught by the r_1 test above, so the branch is dropped.
        return render_template("result.html", value=0)
コード例 #30
0
ファイル: make_db.py プロジェクト: Genji-MS/SnailMail-API
def insertName(name: str = "Mr. Sámuel Falledo"):
    """Normalize a personal name and return its Double Metaphone codes.

    The name is lower-cased, transliterated with ``unidecode``, stripped of
    a "word." prefix that precedes another word and of a trailing
    ", suffix", and its words are sorted alphabetically before encoding.

    Returns:
        The tuple produced by ``doublemetaphone`` for the normalized name.
    """
    # Lower-casing does not seem to be strictly needed for the encoding.
    name = name.lower()

    # remove accented letters
    name = unidecode(name)

    # Remove honorifics. This also removes any other "word." that is followed
    # by another word, not strictly honorifics.
    regex = r"\w+\. *(?=\w+)|,[\s\w]*$"
    # count is passed by keyword: positional count/flags are deprecated in
    # re.sub since Python 3.13.
    name = re.sub(regex, "", name, count=0)

    # TODO: expand common English given-name abbreviations, see
    # https://en.wiktionary.org/wiki/Appendix:Abbreviations_for_English_given_names

    # Sort the words so that word order does not influence the code.
    name = ' '.join(sorted(name.split()))

    # double metaphone conversion
    return doublemetaphone(name)
コード例 #31
0
def phonetic_rhyme(word: str, all_phonetics, thresh=10, alliteration=False):
    """Return up to ``thresh`` entries of ``all_phonetics`` closest to ``word``.

    Closeness is ``phonetic_dist`` between the phonetic form of ``word``
    (looked up via ``helper.get_by_id``) and element 1 of each entry.
    Entries whose element 0 equals the word's id or is three characters or
    shorter are ignored. The result is a list of dicts with the keys
    "word", "d" and "Phon", sorted by ascending "d".
    """
    # get phonetic and metaphone of word to be compared
    info = helper.get_by_id(word, word_relation_table)
    word_id = info['id']
    word_phonetic = info['phonetic']
    _word_metaphone = met.doublemetaphone(info['id'])[0]
    print(word_phonetic)

    matches = []
    for candidate in all_phonetics:
        candidate_word = candidate[0]
        # only compares words that differ and long enough words
        if word_id != candidate_word and len(candidate_word) > 3:
            phon_dist = phonetic_dist(word_phonetic, candidate[1], alliteration)
            entry = {"word": candidate_word, "d": phon_dist, "Phon": candidate[1]}

            if len(matches) < thresh:
                # Still filling up the result list.
                matches.append(entry)
            elif matches[thresh - 1]["d"] > phon_dist:
                # Closer than the current worst match: replace it.
                matches[thresh - 1] = entry

        matches = sorted(matches, key=lambda k: k['d'])

    return matches
コード例 #32
0
def build_roman_dmeta(corpus):
    """Build a Double Metaphone dictionary and write it to the 'dmeta' hash.

    Every non-empty word of ``corpus`` is lower-cased and grouped under its
    primary Double Metaphone code (first-seen order, without duplicates).
    Each group is then stored through ``load_redis(1)`` as a field of the
    'dmeta' hash whose value is the comma-joined word list.
    """
    from metaphone import doublemetaphone

    dmeta = {}
    for word in corpus:
        word = word.lower()
        if word:
            primary = doublemetaphone(word)[0]
            bucket = dmeta.setdefault(primary, [])
            if word not in bucket:
                bucket.append(word)

    # presumably selects Redis database 1 - verify against load_redis
    r = load_redis(1)
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for code, words in dmeta.items():
        r.hmset('dmeta', {code: ",".join(words)})
    return
コード例 #33
0
ファイル: epnames.py プロジェクト: jrladd/front_matter
def fingerprint(name):
    """
    A fingerprinting function that mimics some of the disambiguation features of
    OpenRefine. It is used primarily to compare possible author names.

    Names mentioning "jesu..." or "christ" map to ("christ", ""). Otherwise
    conjunctions, known abbreviations (``abbrev``), a few filler tokens,
    punctuation, possessive 's and digits are normalised away, the words are
    sorted and concatenated, and the Double Metaphone pair of the result is
    returned. When the secondary code is empty, a substitute is derived for
    names ending in a/i/o/u/y or starting with b.
    """
    testname = name
    lowered = testname.lower()
    if (re.search(r"\bjesu", lowered) is not None
            or re.search(r"\bchrist\b", lowered) is not None):
        return ("christ", "")

    testname = re.sub(r"\band\b|\bor\b", "", testname)
    for k, v in abbrev.items():
        # "\\." is the same regex as before, minus the invalid escape "\."
        # that a non-raw f-string warns about.
        patt = f"\\b{k.lower()}\\.?\\b"
        testname = re.sub(patt, v.lower(), testname.lower())
    testname = re.sub(r"\bfl\b|\bof\b|\bd\.|\bb\.", "", testname)
    testname = re.sub(r"[\.\?,!;:\(\)\-\[\]]|'s", "", testname)
    testname = re.sub(r"\d+", "", testname)
    if " " in testname:
        # Sorting makes the fingerprint independent of word order.
        testname = "".join(sorted(testname.split()))
    dm = doublemetaphone(testname)
    if dm[1] == '' and re.search(r"[aiouy]$", testname.lower()) is not None:
        return (dm[0], f"{dm[0]}{testname[-1].upper()}")
    elif dm[1] == '' and re.search(r"^b", testname.lower()) is not None:
        return (dm[0], f"{testname[0].upper()}{dm[0][1:]}")
    else:
        return dm
コード例 #34
0
ファイル: fuzzymetaphone.py プロジェクト: maxharlow/csvmatch
def match(data1, data2, fields1, fields2):
    """Pair up records of two datasets whose fields sound alike.

    ``data1`` and ``data2`` map record keys to {field: value} mappings.
    A pair of records matches when, for every (field1, field2) taken in
    parallel from ``fields1`` and ``fields2``, at least one Double
    Metaphone code of the first value equals a code of the second. The
    secondary-to-secondary comparison additionally requires a non-empty
    code. Returns a list of (data1 key, data2 key) tuples.
    """
    def encode(data):
        # Double Metaphone code pair for every field of every record.
        return {
            key: {field: metaphone.doublemetaphone(data[key][field])
                  for field in data[key]}
            for key in data
        }

    data1phonetic = encode(data1)
    data2phonetic = encode(data2)

    matches = []
    for data1key, data1values in data1phonetic.items():
        for data2key, data2values in data2phonetic.items():
            all_fields_agree = True
            for field1, field2 in zip(fields1, fields2):
                left = data1values.get(field1)
                right = data2values.get(field2)
                possibilities = (
                    left[0] == right[0],
                    left[0] == right[1],
                    left[1] == right[0],
                    left[1] == right[1] != '',
                )
                if not any(possibilities):
                    all_fields_agree = False
            if all_fields_agree:
                matches.append((data1key, data2key))
    return matches
コード例 #35
0
ファイル: playerdb.py プロジェクト: jokker23/Supybot-NFL
def _addplayer(opteid, optrid, optplayer):
    """<eid> <rid> <player name> adds a new player into the database.

    The name is sanitized and split; its first two words are stored as first
    and last name together with their Double Metaphone codes. Returns a
    human-readable success or error message.
    """

    # Row layout example: 2330|1163|tom brady|tom|brady|TM||PRT|
    optplayer = _sanitizeName(optplayer)  # sanitize.
    namesplit = optplayer.split()  # split the optplayer into first, last.
    fndm = doublemetaphone(namesplit[0])  # dm first.
    lndm = doublemetaphone(namesplit[1])  # dm last.
    # connect to the db and finally add.
    with sqlite3.connect(DB) as db:
        try:
            cursor = db.cursor()
            cursor.execute("INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (opteid, optrid, optplayer, namesplit[0], namesplit[1], fndm[0], fndm[1], lndm[0], lndm[1]))
            db.commit()
            return("I have successfully added player {0}({1}).".format(optplayer, opteid))
        # "as" form is valid on Python 2.6+ and 3; "except X, e" is a syntax
        # error on Python 3.
        except sqlite3.Error as e:
            return("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e))
コード例 #36
0
ファイル: __init__.py プロジェクト: oubiwann/tharsk
def getMetaphones(wordList):
    """Return the sorted, de-duplicated Double Metaphone codes of the words.

    ``wordList`` may be a list of words or a single string, which is split
    on whitespace. Each word is passed through ``normalizeUnicode`` before
    encoding; empty codes are dropped.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # string check also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    if isinstance(wordList, string_types):
        wordList = wordList.split()

    metaphones = []
    for word in wordList:
        metaphones.extend(list(metaphone.doublemetaphone(
            normalizeUnicode(word))))
    return sorted([x for x in set(metaphones) if x])
コード例 #37
0
ファイル: metaphoneDict.py プロジェクト: jxjzhang/CS246
def create_metaphone_dict(fp):
    """Map every distinct lower-cased word read from ``fp`` to its codes.

    ``fp`` is iterated line by line and each line is split on whitespace.
    The value stored for a word is the Double Metaphone pair of
    ``purifyWord_deep(word)``; the first occurrence of a word wins.
    """
    metaphoneDict = {}
    for line in fp:
        for word in line.split():
            word = word.lower()
            # Purify and encode only unseen words. The old code purified every
            # occurrence and relied on the Python-2-only itertools.imap.
            if word not in metaphoneDict:
                metaphoneDict[word] = doublemetaphone(purifyWord_deep(word))
    return metaphoneDict
コード例 #38
0
 def create_soundalikes_wordlist(self):
   """Group the words of ``self.input_file`` by sound and export the result.

   Each line is stripped of trailing whitespace and filed under its primary
   Double Metaphone code: the word is appended to that entry's 'graphemes'
   list and passed to the inappropriate-word marker. The finished mapping
   is handed to the exporter.
   """
   words = {}
   # "with" closes the file even when encoding or marking raises.
   with open(self.input_file, 'r') as wordlist:
     for line in wordlist:
       # str.rstrip replaces string.rstrip(), which Python 3 removed.
       word = line.rstrip()
       phonemes = doublemetaphone(word)[0]
       word_info = words.setdefault(phonemes, self.__default_word_info())
       word_info['graphemes'].append(word)
       self.__mark_inappropriate(word_info, word)
   self.__export_wordlist(words)
コード例 #39
0
ファイル: dupable.py プロジェクト: lino-framework/lino
 def reduce_word(cls, s):
     """Reduce ``s`` to a single phonetic code.

     Returns the primary Double Metaphone code of ``s``, or the secondary
     code when the primary is falsy. A None code becomes '' and a byte
     string is decoded as UTF-8.
     """
     # The ``metaphone`` package is used because fuzzy.DMetaphone does not
     # work with unicode strings, see
     # https://bitbucket.org/yougov/fuzzy/issue/2/fuzzy-support-for-unicode-strings-with
     import metaphone as fuzzy

     codes = fuzzy.doublemetaphone(s)
     reduced = codes[0] if codes[0] else codes[1]
     if reduced is None:
         return ''
     if isinstance(reduced, six.binary_type):
         return reduced.decode('utf8')
     return reduced
コード例 #40
0
def metaphone(a):
    """
    Return the (double) metaphone of ``a`` as produced by ``doublemetaphone``.

    Examples noted by the original author:
        "architect" -> ("ARKTKT", "")
        "bajador"   -> ("PJTR", "PHTR")
    """
    codes = doublemetaphone(a)
    return codes
コード例 #41
0
ファイル: playerdb.py プロジェクト: nvuono/Supybot-NFL
def _addplayer(opteid, optrid, optplayer):
    """<eid> <rid> <player name>
    Adds a new player into the database.
    Needs a unique EID, RID, and playername (sanitized and parsed). DM will be calculated upon insertion.
    Ex: 2330 <RID> tom brady

    Returns True when the row was inserted and None when the insert failed
    or the sanitized name does not contain both a first and a last name.
    """

    # everything looks good so lets prep to add.  # 2330|1163|tom brady|tom|brady|TM||PRT|
    optplayer = _sanitizeName(optplayer)  # sanitize.
    namesplit = optplayer.split()  # split the name into first, last. (name needs to be parsed before)
    if len(namesplit) < 2:
        # Without this guard namesplit[1] raises IndexError for one-word names.
        print("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, "need a first and a last name"))
        return None
    firstname, lastname = namesplit[0], namesplit[1]
    fndm = doublemetaphone(firstname)  # dm first.
    lndm = doublemetaphone(lastname)  # dm last.
    # connect to the db and finally add.
    with sqlite3.connect(DB) as db:
        try:
            cursor = db.cursor()
            cursor.execute("INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (opteid, optrid, optplayer, firstname, lastname, fndm[0], fndm[1], lndm[0], lndm[1]))
            db.commit()
            return True
        except sqlite3.Error as e:  # "as" form is valid on Python 2.6+ and Python 3.
            print("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e))
            return None
コード例 #42
0
ファイル: cerca.py プロジェクト: shlomihod/cerca
def get_all_similar_phonetic_words(dst_lang, src_word):
    """Return the words of `dst_lang` that sound like `src_word`.

    The source word is run through ``preprocess_es`` and encoded with double
    metaphone.  Matches for the primary code are used when there are any;
    otherwise the secondary code (if present) is tried.  The result is the
    list of ``.word`` attributes of the matches.
    """
    primary, secondary = doublemetaphone(preprocess_es(src_word))

    matches = get_similar_phonetic_words(dst_lang, primary) if primary else []

    # An empty primary code always leaves `matches` empty, so checking
    # `matches` alone covers both reasons for falling back.
    if not matches and secondary:
        matches = get_similar_phonetic_words(dst_lang, secondary)

    return [match.word for match in matches]
コード例 #43
0
def tokenFeatures(token) :
    """Build the feature dictionary for a single token.

    The token is lower-cased and stripped of surrounding punctuation
    (``token_clean``), then of every non-word character (``token_abbrev``).
    The returned dict holds the named features below, prefix/suffix features
    for lengths 1-5 (always shorter than the token), and one ``True`` entry
    per 3-gram and 4-gram produced by ``ngrams``.
    """
    # NOTE: (u'&') is a plain string, not a tuple, so this is a substring test.
    if token in (u'&') :
        token_clean = token
        token_abbrev = token
    else :
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token.lower())
        token_abbrev = re.sub(r'\W', u'', token_clean)

    codes = doublemetaphone(token_abbrev)
    abbrev_length = len(token_abbrev)

    features = {
        'nopunc' : token_abbrev,
        'abbrev' : token_clean.endswith('.'),
        'comma'  : token.endswith(','),
        'hyphenated' : '-' in token_clean,
        'contracted' : "'" in token_clean,
        'bracketed' : bool(re.match(r'(["(\']\w+)|(\w+[")\'])', token) and not re.match(r'["(\']\w+[")\']', token)),
        'fullbracketed' : bool(re.match(r'["(\']\w+[")\']', token)),
        'length' : abbrev_length,
        'initial' : abbrev_length == 1 and token_abbrev.isalpha(),
        'has.vowels'  : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
        'just.letters' : token_abbrev.isalpha(),
        'roman' : set('xvi').issuperset(token_abbrev),
        'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
        'digits' : digits(token_abbrev),
        'metaphone1' : codes[0],
        # Fall back to the primary code when there is no secondary one.
        'metaphone2' : codes[1] or codes[0],
        'more.vowels' : vowelRatio(token_abbrev),
        'in.names' : token_abbrev.upper() in ratios,
        'prepositions' : token_abbrev in PREPOSITIONS,
        'first.name' : ratios.get(token_abbrev.upper(), 0),
        'gender_ratio' : gender_names.get(token_abbrev, False),
        'possessive' : token_clean.endswith("'s"),
    }

    # Affixes of 1 to 5 characters, never the whole token.
    for size in range(1, min(abbrev_length, 6)) :
        features['prefix_%s' % size] = token_abbrev[:size]
        features['suffix_%s' % size] = token_abbrev[-size:]

    for gram_size in (3, 4) :
        for gram in ngrams(token_abbrev, gram_size) :
            features[gram] = True

    return features
コード例 #44
0
ファイル: playerdb.py プロジェクト: nvuono/Supybot-NFL
def _rehashdm(eid):
    """<eid>
    Recalculate the doublemetaphone for a player (eid)
    Ex: 2330

    Returns True when the row was updated and None when the player does not
    exist or the update failed.
    """

    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?", (eid,))
        row = cursor.fetchone()
    if row is None:
        # fetchone() returns None for an unknown eid; indexing it would raise TypeError.
        print("ERROR: _rehashdm: I cannot update EID {0}'s doublemetaphone: '{1}'".format(eid, "no such player"))
        return None
    # calculate the dm on first, last
    fndm = doublemetaphone(row[0])
    lndm = doublemetaphone(row[1])
    # fndm and lndm are (primary, secondary) tuples.
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        try:
            cursor.execute("UPDATE players SET fndm1=?, fndm2=?, lndm1=?, lndm2=? WHERE eid=?", (fndm[0], fndm[1], lndm[0], lndm[1], eid,))
            db.commit()
            return True
        except sqlite3.Error as e:  # "as" form is valid on Python 2.6+ and Python 3.
            print("ERROR: _rehashdm: I cannot update EID {0}'s doublemetaphone: '{1}'".format(eid, e))
            return None
コード例 #45
0
ファイル: name.py プロジェクト: alexk307/last_name
def suggest_name(name):
    """
    Suggest known names that share a primary double-metaphone code with a name
    :param name: The name to generate matches on
    :return: list of (known name, distance) pairs sorted by ascending
        ``levenshtein_distance`` to the upper-cased input, or ``[()]`` when
        no known name shares the code
    """
    name = name.upper()
    code = doublemetaphone(name)[0]

    if code not in names:
        return [()]

    distances = {
        candidate: levenshtein_distance(name, candidate)
        for candidate in names[code]
    }
    return sorted(distances.items(), key=lambda pair: pair[1])
コード例 #46
0
ファイル: Model.py プロジェクト: esitarski/CallupSeedingMgr
	def match_indices( self, search, indices ):
		'''
		Find the results that match ``search`` on every index named in ``indices``.
		
		For each index name the index is read from ``self`` and the value to
		look up is read from ``search`` (attribute named by
		``self.field_from_index``).  Index names starting with 'by_mp_' are
		soundalike indices: they match on any non-empty double-metaphone code
		of the value with hyphens removed.  Other indices match on the value
		itself.  The per-index matches are intersected.
		
		Returns ``FindResult( search, matches, self, soundalike )`` where
		``matches`` is None if a value or index was missing (or ``indices``
		was empty), otherwise the (possibly empty) intersection set.
		'''
		# Look for a set intersection of one element between all source criteria.
		
		if self.debug: print ( 'match_indices: searchKeys=', indices )
		
		soundalike = False
		setCur = None
		for idx_name in indices:
			if self.debug: print ( "match_indices: matching on key:", idx_name )
			idx = getattr( self, idx_name )
			v = getattr( search, self.field_from_index(idx_name), None )
			if not v or not idx:
				setCur = None
				if self.debug: print ( 'match_indices: missing attribute' )
				break

			# Best effort: values that cannot be normalized are looked up as-is.
			# Only Exception is caught so Ctrl-C and SystemExit still propagate.
			try:
				v = normalize_name_lookup( v )
			except Exception:
				pass
				
			if self.debug: print ( 'match_indices: value=', v )
			
			found = set()
			if idx_name.startswith( 'by_mp_' ):
				soundalike = True
				for mp in doublemetaphone(v.replace('-','').encode('utf8')):
					if mp and mp in idx:
						found |= set(idx[mp])
			elif v in idx:
				found = set(idx[v])
			
			# `found` is a fresh set each iteration, so no defensive copy is needed.
			if setCur is None:
				setCur = found
			else:
				setCur &= found
			
			if not setCur:
				if self.debug: print ( "match_indices: match failed. found=", found )
				break
			
			if self.debug: print ( "matched:", setCur )
		
		return FindResult( search, setCur, self, soundalike )
コード例 #47
0
ファイル: build.py プロジェクト: jsfenfen/python-us
def pickle_data():
    """Read every state row from ``data.db`` and pickle the list to ``us/states.pkl``.

    Rows are read in name order.  Each row gains a ``name_metaphone`` entry
    (primary double-metaphone code of its name) and its ``is_territory``
    value is converted to a bool.  Both paths are resolved relative to
    ``PWD``.
    """

    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))

    conn = sqlite3.connect(dbpath)
    try:
        conn.row_factory = dict_factory

        c = conn.cursor()
        c.execute("""SELECT * FROM states ORDER BY name""")

        states = []

        for row in c:
            row['name_metaphone'] = doublemetaphone(row['name'])[0]
            row['is_territory'] = row['is_territory'] == 1
            states.append(row)
    finally:
        # The connection was previously left open; always release it.
        conn.close()

    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))

    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
コード例 #48
0
ファイル: states.py プロジェクト: jsfenfen/python-us
def lookup(val, field=None, use_cache=True):
    """ Semi-fuzzy state lookup: a best-effort attempt at finding the state
        that corresponds to the lookup value provided.

        When `field` is not given it is inferred from the value:

          * a value matching FIPS_RE is compared against the `fips` field
          * a value matching ABBR_RE is upper-cased and compared against
            the `abbr` field
          * anything else is converted to its primary metaphone code and
            compared against the `name_metaphone` field

        Metaphone is used to allow for incorrect, but phonetically accurate,
        spelling of state names.

        Passing `field` skips the inference and does an exact,
        case-sensitive comparison against that attribute of each state.

        Matches are stored in the lookup cache. Passing `use_cache=False`
        skips reading the cache; a match found is still stored. Returns
        None when no state matches.
    """

    from metaphone import doublemetaphone

    if field is None:
        if FIPS_RE.match(val):
            field = 'fips'
        elif ABBR_RE.match(val):
            field, val = 'abbr', val.upper()
        else:
            field, val = 'name_metaphone', doublemetaphone(val)[0]

    # see if result is in cache
    cache_key = "%s:%s" % (field, val)
    if use_cache:
        if cache_key in _lookup_cache:
            return _lookup_cache[cache_key]

    match = next(
        (state for state in STATES_AND_TERRITORIES
         if val == getattr(state, field)),
        None)
    if match is not None:
        _lookup_cache[cache_key] = match
    return match
コード例 #49
0
ファイル: __init__.py プロジェクト: ahjohns/probablepeople
def tokenFeatures(token) :
    """Build the feature dictionary for a single token.

    ``token_clean`` is the token stripped of surrounding punctuation (case
    preserved); ``token_abbrev`` is its lower-cased form with every non-word
    character removed.  The returned dict holds the named features below
    plus prefix/suffix features for lengths 1-5 (always shorter than the
    token).
    """
    # NOTE: (u'&') is a plain string, not a tuple, so this is a substring test.
    if token in (u'&') :
        token_clean = token
        token_abbrev = token
    else :
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token)
        token_abbrev = re.sub(r'\W', u'', token_clean.lower())

    codes = doublemetaphone(token_abbrev)
    abbrev_length = len(token_abbrev)

    features = {
        'nopunc' : token_abbrev,
        'abbrev' : token_clean.endswith('.'),
        'comma'  : token.endswith(','),
        'hyphenated' : '-' in token_clean,
        'contracted' : "'" in token_clean,
        'bracketed' : bool(re.match(r'["(\']\w+[")\']', token)),
        'length' : abbrev_length,
        'initial' : abbrev_length == 1 and token_abbrev.isalpha(),
        'has.vowels'  : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
        'roman' : set('xvi').issuperset(token_abbrev),
        'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
        'metaphone1' : codes[0],
        # Fall back to the primary code when there is no secondary one.
        'metaphone2' : codes[1] or codes[0],
        'more.vowels' : vowelRatio(token_abbrev),
        'in.names' : float(token_abbrev.upper() in ratios),
        'first.name' : float(ratios.get(token_abbrev.upper(), 0)),
        'possessive' : token_clean.endswith("'s"),
    }

    # Affixes of 1 to 5 characters, never the whole token.
    for size in range(1, min(abbrev_length, 6)) :
        features['prefix_%s' % size] = token_abbrev[:size]
        features['suffix_%s' % size] = token_abbrev[-size:]

    return features
コード例 #50
0
ファイル: Model.py プロジェクト: esitarski/CallupSeedingMgr
	def add( self, result ):
		'''
		Record ``result`` and register it in every index of ``self.Indices``.
		
		Fields of ``Result.Fields`` that are set on the result are noted in
		``self.hasField``.  For each index the result's value for the
		corresponding field is used as key; empty values are skipped.
		Indices whose name starts with 'by_mp_' are keyed by each non-empty
		double-metaphone code of the value (hyphens removed); all other
		indices are keyed by the normalized value, or by the raw value when
		normalization fails.
		'''
		self.results.append( result )
		
		# Index names noted by the original author:
		#   'by_license', 'by_uci_id',
		#   'by_last_name', 'by_first_name',
		#   'by_mp_last_name', 'by_mp_first_name',
		#   'by_nation_code', 'by_date_of_birth', 'by_age',
		
		for field in Result.Fields:
			if getattr( result, field, None ):
				self.hasField.add( field )
		
		for idx_name in self.Indices:
			field = self.field_from_index(idx_name)
			v = getattr( result, field, None )
			if not v:
				continue
			idx = getattr( self, idx_name )
			if idx_name.startswith( 'by_mp_' ):	# Initialize a doublemetaphone (soundalike) index.
				for mp in doublemetaphone(v.replace('-','').encode('utf8')):
					if mp:
						idx.setdefault( mp, [] ).append( result )
			else:								# Initialize a regular field index.
				assert idx_name != 'by_license' or v not in idx, 'Duplicate license: {}'.format(v)
				# Best effort: values that cannot be normalized are used as-is.
				# Only Exception is caught so Ctrl-C and SystemExit still propagate.
				try:
					key = normalize_name_lookup(v)
				except Exception:
					key = v
				idx.setdefault( key, [] ).append( result )
コード例 #51
0
from metaphone import doublemetaphone
# Write the primary and secondary double-metaphone code of every
# whitespace-separated word in dict.txt to dict1, one code per line.
filename = 'dict.txt'
file_out = "dict1"

# Context managers close both handles, so the output is flushed to disk.
with open(filename, 'r') as file_input:
    file_content = file_input.read().split()

with open(file_out, 'w') as file_output:
    for word in file_content:
        word_list = doublemetaphone(word)
        output = str(word_list[0]+"\n"+word_list[1]+"\n")
        file_output.write(output)
	
コード例 #52
0
ファイル: datagen.py プロジェクト: greinerb/Fuzzgo
def soundslike(word):
    """Return the primary and secondary double-metaphone codes of `word`
    concatenated, or None when `word` is None."""
    if word is not None:
        codes = doublemetaphone(word)
        return codes[0] + codes[1]
    return None
コード例 #53
0
def get_metaphone_from_word(word):
  """Return the primary double-metaphone code of `word` when it is longer
  than one character, otherwise the secondary code."""
  # Encode once instead of calling doublemetaphone up to three times.
  codes = doublemetaphone(word)
  return codes[0] if len(codes[0]) > 1 else codes[1]
コード例 #54
0
ファイル: Model.py プロジェクト: esitarski/CallupSeedingMgr
	def add_source( self, source ):
		'''Append ``source`` to ``self.sources``.'''
		self.sources.append( source )
		
# Ad-hoc debugging entry point: build a Source and dump its soundalike index.
if __name__ == '__main__':
	s = Source( 'CallupTest.xlsx', '2014 Result' )
	# errors = s.read( GetExcelReader(self.fname) )
	print ( s.by_mp_last_name )
	# NOTE: this exit is unconditional, so nothing below it in this block ever runs.
	sys.exit()
	
	#for r in s.results:
	#	print ( r )
	# Dump both soundalike indices, sorted by metaphone code.
	for k, v in sorted( ((k, v) for k, v in s.by_mp_last_name.items()), key=operator.itemgetter(0) ):
		print ( '{}: {}'.format(k, ', '.join( Utils.removeDiacritic(r.full_name) for r in v )) )
	for k, v in sorted( ((k, v) for k, v in s.by_mp_first_name.items()), key=operator.itemgetter(0) ):
		print ( '{}: {}'.format(k, ', '.join( Utils.removeDiacritic(r.full_name) for r in v )) )
		
	# Report groups of more than one result sharing a last-name code and a first-name code.
	for r in s.results:
		for p_last in doublemetaphone(r.last_name.replace('-','').encode('utf8')):
			if not p_last:
				continue
			p_last_set = s.by_mp_last_name[p_last]
			# NOTE(review): unlike p_last, an empty p_first is not skipped before the
			# index lookup below - confirm whether that lookup can fail.
			for p_first in doublemetaphone(r.first_name.replace('-','').encode('utf8')):
				p_first_set = s.by_mp_first_name[p_first]
				# NOTE(review): '&' assumes the index values are sets - TODO confirm.
				p_last_first_set = p_last_set & p_first_set
				if len(p_last_first_set) > 1:
					print ( ', '.join( u'({}, {}, {})'.format(
							Utils.removeDiacritic(rr.full_name), Utils.removeDiacritic(rr.nation_code), rr.age,
						)
						for rr in p_last_first_set ) )

コード例 #55
0
ファイル: fuzzgo.py プロジェクト: greinerb/Fuzzgo
def soundslike(word):
    """Return the primary and secondary double-metaphone codes of `word`
    joined into a single string."""
    codes = doublemetaphone(word)
    primary = codes[0]
    secondary = codes[1]
    return primary + secondary
コード例 #56
0
ファイル: Model.py プロジェクト: esitarski/CallupSeedingMgr
def soundalike_match( s1, s2 ):
	'''
	Return True if ``s1`` and ``s2`` share a non-empty double-metaphone code.
	
	Hyphens are removed and the strings are utf8-encoded before encoding.
	'''
	dmp1 = doublemetaphone( s1.replace('-','').encode('utf8') )
	dmp2 = doublemetaphone( s2.replace('-','').encode('utf8') )
	# Empty codes must be ignored: a missing secondary code is '' for both
	# words and would otherwise make unrelated words match.
	return any( v and v in dmp1 for v in dmp2 )
コード例 #57
0
from metaphone import doublemetaphone

# Interactive loop: read a word, fetch candidate words from a trie and compare
# the double-metaphone codes of the input with those of each candidate.
# NOTE: the name `string` would shadow the stdlib module of the same name if it is imported.
string = 'data/prior.txt'
# NOTE(review): pickle.load can execute arbitrary code - confirm this file is trusted.
with open(string, 'rb') as f:
	prior = pickle.load(f)
while True:
	# print "Please give the word:"
	# NOTE: `input` shadows the builtin of the same name.
	input = raw_input()
	input = input.upper()
	start = time.time()
	# presumably 4 is the maximum distance searched in the trie - TODO confirm
	cands = trie.search(input, 4)
	# print "Got results from trie" + str(len(cands))
	edit = {}
	prob = {}
	for i in cands:
		# NOTE: input_ph does not depend on i and could be computed once before this loop.
		input_ph = doublemetaphone(input)
		word_ph = doublemetaphone(i[0])
		# maxphval keeps the largest lev.distance seen among the compared code pairs.
		maxphval = -1
		
		# primary input code vs primary candidate code
		if input_ph[0]!='' and word_ph[0]!='':
		    phonetic_val = lev.distance(input_ph[0],word_ph[0])
		    if maxphval<phonetic_val:
		        maxphval = phonetic_val
		
		# primary input code vs secondary candidate code
		if input_ph[0]!='' and word_ph[1]!= '':
		    phonetic_val = lev.distance(input_ph[0],word_ph[1])
		    # NOTE: the next line is indented with one tab plus spaces, unlike its neighbours.
	            if maxphval<phonetic_val:
		        maxphval = phonetic_val
		        
		# secondary input code vs primary candidate code
		# NOTE: the visible snippet ends before this distance is folded into maxphval.
		if input_ph[1]!= '' and word_ph[0]!='':
		    phonetic_val = lev.distance(input_ph[1],word_ph[0])
コード例 #58
0
ファイル: predicates.py プロジェクト: 01-/dedupe
def doubleMetaphone(field) :
    """Return the non-empty double-metaphone codes of `field` as a list."""
    codes = []
    for code in doublemetaphone(field) :
        if code :
            codes.append(code)
    return codes
コード例 #59
0
ファイル: predicates.py プロジェクト: 01-/dedupe
def metaphoneToken(field) :
    """Return the set of non-empty double-metaphone codes of the distinct
    whitespace-separated tokens of `field`."""
    codes = set()
    for token in set(field.split()) :
        codes.update(doublemetaphone(token))
    return {code for code in codes if code}