Example #1
def initializeSoundexDict(freqDict):
    # Precompute the Soundex code for every word in the frequency dictionary
    soundexScore = Soundex().soundex
    freqDictSoundex = {}
    for c in freqDict:
        freqDictSoundex[c] = soundexScore(c.capitalize())

    return freqDictSoundex
Example #2
def soundex(word):
    # freqDict, freqDictSoundex, candidates_nb and P are defined elsewhere in
    # the original module (word frequencies, their Soundex codes, the number
    # of candidates to return, and a scoring function used as the max key).
    word = word.capitalize()
    sound_candidates = []
    soundexScore = Soundex().soundex
    wordScore = soundexScore(word)
    # Collect every dictionary word whose Soundex code matches the input's
    for c in freqDict:
        if wordScore == freqDictSoundex[c]:
            sound_candidates.append(c)

    if len(sound_candidates) == 0:
        sound_candidates.append(word)

    # Keep the candidates_nb best-scoring candidates, in decreasing order
    words = []
    for i in range(min(len(sound_candidates), candidates_nb)):
        words.append(max(sound_candidates, key=P))
        sound_candidates.remove(words[i])
    return words
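Taken together, Examples #1 and #2 look like the candidate-generation step of a spelling corrector. Below is a minimal sketch of how the two snippets might be wired up; freqDict, candidates_nb and P are not shown in the excerpts, so the definitions here are assumptions made for illustration.

from soundex import Soundex

# Hypothetical globals assumed by the two snippets above
freqDict = {'night': 120, 'knight': 7, 'naught': 3}   # toy word -> count table
candidates_nb = 2                                      # how many candidates to keep

def P(word):
    # hypothetical scorer: relative frequency of a candidate word
    return freqDict.get(word, 0) / sum(freqDict.values())

freqDictSoundex = initializeSoundexDict(freqDict)
# Words from freqDict whose Soundex code matches the query; falls back to
# the query itself ('Nite') if nothing in the dictionary sounds like it.
print(soundex('nite'))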
Example #3
def has_player_name_fuzzy(current, edu):
    """True if the EDU has a word that sounds like a player name."""
    tokens = edu.tokens
    soundex = lambda w: Soundex().soundex(w)
    return has_one_of_words(current.players, tokens, norm=soundex)
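has_one_of_words is defined elsewhere in that project; the sketch below is only a hypothetical reading of what a helper with a norm keyword could do (treating tokens as plain strings), not the project's actual implementation.

def has_one_of_words(candidates, tokens, norm=lambda w: w.lower()):
    # Hypothetical sketch: compare tokens and candidate names under the same
    # normalization (here a Soundex code), so spellings such as 'Josef' and
    # 'Joseph' can land on the same key.
    keys = {norm(c) for c in candidates}
    return any(norm(tok) in keys for tok in tokens)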
Example #4
    phonemes2 = np.array([ipa[c] for c in pad2])
    features = {'word 1 encoding': phonemes1, 'word 2 encoding': phonemes2}
    return features


#%% Open some things
if __name__ == '__main__':
    TRAIN_PATH = '../data/cognet_train.csv'
    TEST_PATH = '../data/cognet_test.csv'
    DEV_PATH = '../data/cognet_dev.csv'
    DATA_PATH = '../data/extracted_features.npy'
    SUPPORTED_LANGS_PATH = '../data/cognet_supported_langs.tsv'
    IPA_ENCODING_PATH = '../data/ipa_encodings.pickle'

    v = DictVectorizer(sparse=False)
    soundex = Soundex()
    epitran_dict = create_epitran_dict()
    with open(IPA_ENCODING_PATH, 'rb') as f:
        ipa = pickle.load(f)
    ipa = defaultdict(lambda: np.array([0.] * 24), ipa)

#%% FEATURE EXTRACTION
if __name__ == '__main__':
    print('Reading training data...')
    train_data = pd.read_csv(TRAIN_PATH)
    print('Extracting features...')
    x_train = v.fit_transform([
        extract_features(str(lang1), str(word1), str(lang2), str(word2))
        for lang1, word1, lang2, word2 in
        zip(train_data['lang 1'], train_data['translit 1'], train_data['lang 2'], train_data['translit 2'])
    ])
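The excerpt stops after the training split; the cell below is a hedged continuation sketch showing how the dev split could be vectorized with the already-fitted DictVectorizer. This step is an assumption and is not part of the original code.

#%% FEATURE EXTRACTION (dev split) -- continuation sketch, not from the original
if __name__ == '__main__':
    print('Reading dev data...')
    dev_data = pd.read_csv(DEV_PATH)
    x_dev = v.transform([
        extract_features(str(lang1), str(word1), str(lang2), str(word2))
        for lang1, word1, lang2, word2 in
        zip(dev_data['lang 1'], dev_data['translit 1'], dev_data['lang 2'], dev_data['translit 2'])
    ])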
Example #5
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 18 00:04:21 2017

@author: Milton

"""

from foneticaBR import foneticaBR
from buscabr import buscaBR
from metaphoneBR import metaphoneBR

# Soundex - install the soundex and silpa_common libraries in Python

from soundex import Soundex

chaveRoberto = foneticaBR()
chavebr = buscaBR()
chavemeta = metaphoneBR()
chavesoundex = Soundex()

# Compare the phonetic key each algorithm produces for the same input
texto = 'JOSSEPH'
print(chaveRoberto.chavefonetica(texto))
print(chavebr.chaveBR(texto, False))
print(chavebr.chaveBR(texto, True))
print(chavemeta.chaveMetaphoneBR(texto))
print(chavesoundex.soundex(texto))
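A small follow-up on why such phonetic keys are useful: variant spellings of a name tend to collapse to the same code. The extra spellings below are made up for illustration, and the exact codes depend on each library.

# Variant spellings of the same name tend to share a phonetic key,
# which is what makes these encodings usable for fuzzy name matching.
for variante in ('JOSSEPH', 'JOSEPH', 'JOZEF'):
    print(variante, chavesoundex.soundex(variante))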
Example #6
def sndx(s):
    return Soundex().soundex(stemmer(s, args.stem))
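stemmer and args come from the surrounding script in the original project; the sketch below is a guess at that context, with the argparse flag and the Snowball-based stemmer as assumptions rather than facts about the source.

import argparse
from soundex import Soundex
from nltk.stem.snowball import SnowballStemmer

parser = argparse.ArgumentParser()
parser.add_argument('--stem', default='english')   # hypothetical flag backing args.stem
args = parser.parse_args()

def stemmer(s, lang):
    # hypothetical helper: reduce s to its stem before computing the Soundex code
    return SnowballStemmer(lang).stem(s)

def sndx(s):
    return Soundex().soundex(stemmer(s, args.stem))

print(sndx('running'), sndx('runs'))   # stemming first lets inflected forms share a code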
Example #7
from soundex import Soundex
from collections import defaultdict
from wordfreq import word_frequency

soundex = Soundex().soundex

# Group the wordlist entries by their Soundex code
sound_words = defaultdict(set)

with open('eff_short_wordlist_1.txt', 'r') as fh:
    for line in fh:
        word = line.split()[1]  # the word is the second column of each line
        sound = soundex(word)
        if len(sound) > 1:  # and sound not in ('i245', 't651'):
            sound_words[sound].add(word)

# For each group of similar-sounding words, print only the most frequent one
for word_set in sound_words.values():
    if len(word_set) > 1:
        word_list = [(word_frequency(word, 'en'), word) for word in word_set]
        word_list.sort()
        print(word_list[-1][-1])
    else:
        print(list(word_set)[0])
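A quick way to see which wordlist entries actually collided, i.e. where the frequency-based tie-break above mattered; this reuses only names already defined in the snippet.

# Print every Soundex group that contained more than one similar-sounding word
for sound, word_set in sound_words.items():
    if len(word_set) > 1:
        print(sound, sorted(word_set))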
Example #8
    'ben': SCHEMES[BENGALI],
    'guj': SCHEMES[GUJARATI],
    'hin': SCHEMES[DEVANAGARI],
    'kan': SCHEMES[KANNADA],
    'mal': SCHEMES[MALAYALAM],
    'mar': SCHEMES[DEVANAGARI],
    "ori": SCHEMES[ORIYA],
    'pun': SCHEMES[GURMUKHI],
    'tam': SCHEMES[TAMIL],
    'tel': SCHEMES[TELUGU],
    'urd': SCHEMES[DEVANAGARI]
}
if lang in scheme_dict:
    src_scheme = scheme_dict[lang]
scheme_map = SchemeMap(src_scheme, SCHEMES[HK])
instance = Soundex()

findings = {}
for i, line in enumerate(inp_bible):
    curr_line_id = 23146 + i
    if line == "" or line == "\n":
        continue

    for index, name in enumerate(names_reference):
        lids = ast.literal_eval(name[lid_col])
        for col in col_references:
            romans = name[col]
            if romans == "":
                continue

            print(".", end="")