def test_sonority_sequencing_syllable_tokenizer(self):
    """
    Test SyllableTokenizer tokenizer.
    """
    tokenizer = SyllableTokenizer()
    tokens = tokenizer.tokenize('justification')
    self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
def test_sonority_sequencing_syllable_tokenizer(self):
    """
    Test SyllableTokenizer tokenizer.
    """
    tokenizer = SyllableTokenizer()
    tokens = tokenizer.tokenize("justification")
    assert tokens == ["jus", "ti", "fi", "ca", "tion"]
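# The two tests above exercise the same behaviour, once in unittest style and
# once in pytest style. A quick interactive check of the tokenizer they rely on:
from nltk.tokenize import SyllableTokenizer

SSP = SyllableTokenizer()
print(SSP.tokenize("justification"))  # ['jus', 'ti', 'fi', 'ca', 'tion']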
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
    We write it to vocabulary_path in a one-token-per-line format, so that
    later the token in the first line gets id=0, the token in the second line
    gets id=1, and so on.

    Args:
      vocabulary_path: path where the vocabulary will be created.
      data_path: data file that will be used to create vocabulary.
      max_vocabulary_size: limit on the size of the created vocabulary.
      tokenizer: a function to use to tokenize each data sentence;
        if None, basic_tokenizer will be used.
      normalize_digits: Boolean; if true, all digits are replaced by 0s.
    """
    if not gfile.Exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
        vocab = {}
        trans_list = [
            "ai", "ay", "ei", "ea", "ey", "ee", "ew", "eu", "oo", "oa", "ou",
            "ie", "ch", "ck", "tc", "bt", "gh", "dg", "th", "sh", "gn", "mb",
            "mn", "kn", "wh", "ng", "ph", "wr", "er", "or", "an", "al", "wa",
            "or", "ar", "ig", "qu", "il", "in", "al", "ow", "oy", "au"
        ]
        with gfile.GFile(data_path, mode="rb") as f:
            counter = 0
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    print("  processing line %d" % counter)
                line = tf.compat.as_bytes(line)
                tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
                for w in tokens:
                    word = _DIGIT_RE.sub(b"0", tf.compat.as_bytes(w)) if normalize_digits else w
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
            # vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
            vocab_list = _START_VOCAB + sorted(trans_list)
            if len(vocab_list) > max_vocabulary_size:
                vocab_list = vocab_list[:max_vocabulary_size]
            with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
                for w in vocab_list:
                    vocab_file.write(tf.compat.as_bytes(w) + b"\n")
                    # vocab_file.write(w + b"\n")
        SSP = SyllableTokenizer()
        print("mem is: ", SSP.tokenize('justification'))
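# A hedged usage sketch for the function above, assuming the usual TensorFlow
# seq2seq helpers (_START_VOCAB, basic_tokenizer, _DIGIT_RE, gfile) are in
# scope and that the file names below are purely illustrative. Note that this
# variant discards the frequency counts it gathers and writes _START_VOCAB
# followed by the alphabetically sorted trans_list digraphs instead.
create_vocabulary("vocab.txt", "train.txt", max_vocabulary_size=100)
with open("vocab.txt", "rb") as f:
    # Start symbols first, then "ai", "al", "al", "an", ...
    print(f.read().splitlines()[:8])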
def syllabify_orth_with_nltk(token, num_sylls=None):
    global nltk_ssp
    if not nltk_ssp:
        from nltk.tokenize import SyllableTokenizer
        nltk_ssp = SyllableTokenizer()
    l = nltk_ssp.tokenize(token)
    return l
def getSyllables(request, text):
    textSyllables = []
    SSP = SyllableTokenizer()
    tokenised_sentences = nltk.sent_tokenize(text)
    for sentence in tokenised_sentences:
        tokenised_words = nltk.word_tokenize(sentence)
        # tagged_words = nltk.pos_tag(tokenised_words)
        for word in tokenised_words:
            tokenised_syllables = SSP.tokenize(word)
            # textSyllables = textSyllables.join(tokenised_syllables)
            textSyllables += tokenised_syllables
    return JsonResponse(textSyllables, safe=False)
def syllableCount(cleansedWordsList):
    wordCountThisDocument = 0
    totalSyllsThisDocument = 0
    SSP = SyllableTokenizer()
    for words in cleansedWordsList:
        numSyllThisWord = len(SSP.tokenize(words.lower()))
        # Run if number of syllables > 0 and list entry not null
        if numSyllThisWord > 0 and words:
            wordCountThisDocument = wordCountThisDocument + 1
            totalSyllsThisDocument = totalSyllsThisDocument + numSyllThisWord
    if wordCountThisDocument > 0:
        averageNumSyllables = totalSyllsThisDocument / wordCountThisDocument
    else:
        averageNumSyllables = "N/A"
    averageNumSyllableList.append(averageNumSyllables)
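# A minimal driver for syllableCount, as a sketch: the snippet above appends to
# a module-level averageNumSyllableList that is not shown, so it is assumed and
# declared here, and the SyllableTokenizer import it relies on is added.
from nltk.tokenize import SyllableTokenizer

averageNumSyllableList = []
syllableCount(["justification", "syllable", "count"])
print(averageNumSyllableList)  # one average-syllables-per-word value per call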
def syllabify_orth_with_nltk(token, num_sylls=None):
    global nltk_ssp
    if not nltk_ssp:
        from nltk.tokenize import SyllableTokenizer
        nltk_ssp = SyllableTokenizer()
    tokenl = token.lower()
    l = nltk_ssp.tokenize(tokenl)
    if tokenl != token:
        # Re-slice the original token so the syllables keep its casing.
        o = []
        i = 0
        for x in l:
            xlen = len(x)
            o += [token[i:i + xlen]]
            i += xlen
        l = o
    return l
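# Example use of the variant above (a sketch; nltk_ssp is the module-level
# cache the function expects, initialised to None here). Lower-casing before
# tokenizing and then re-slicing the original token preserves the input's
# capitalisation in the returned syllables.
nltk_ssp = None
print(syllabify_orth_with_nltk("Justification"))  # e.g. ['Jus', 'ti', 'fi', 'ca', 'tion']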
def __init__(self, dictName='-', lang=None):
    # Sonority Sequencing Tokenizer defaults to 26 latin letters,
    # english pronunciation.
    self.SSP = SyllableTokenizer()
    self.changeLang(lang)
    self.dict = {"words": []}
    if (dictName == '-'):
        dictName = os.path.dirname(__file__) + "/dict.yaml"
    try:
        with open(dictName) as f:
            self.dict = yaml.safe_load(f)
    except BaseException:
        error("%s could not be loaded." % dictName)
    # CMU Pronunciation dictionary includes 119K+ english words plus some
    # proper nouns using the latin alphabet, occasionally with punctuation.
    self.d = cmudict.dict()
def run(words):
    # parts = [decompose(x, stub=4) for x in words]
    SSP = SyllableTokenizer()
    parts = [SSP.tokenize(x) for x in words]
    portmanteau = stitch("", parts)
    return portmanteau
import re
from num2words import num2words
from typing import List, Any

import nltk
from nltk.corpus import cmudict
from nltk.tokenize import SyllableTokenizer
import wordninja

# nltk.download('cmudict')
pro_dict = cmudict.dict()
ssp = SyllableTokenizer()


def count_syllables(words: List[str]) -> set:
    word_syl_dict = {word: [] for word in words}
    for word in words:
        syllable_nums = cmudict_syl(word)
        if syllable_nums is False:
            syllable_nums = [ssp_syl(word)]
        word_syl_dict[word] = syllable_nums
    total_syl_set = set()
    recur_count_syl(0, word_syl_dict, words.copy(), total_syl_set)
    return total_syl_set


def recur_count_syl(syl_count: int, word_syl_dict: dict, words: List[str],
                    total_syl_set: set):
    while words:
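# The helpers cmudict_syl and ssp_syl are not part of the snippet above (which
# is also truncated inside recur_count_syl). The sketch below shows one
# plausible shape for them, under the assumption that cmudict_syl returns the
# possible syllable counts for a word (or False when the word is missing from
# cmudict) and ssp_syl falls back to the SyllableTokenizer split.
def cmudict_syl(word: str):
    prons = pro_dict.get(word.lower())
    if not prons:
        return False
    # Count stress-marked phones (those ending in a digit) per pronunciation.
    return [sum(1 for phone in pron if phone[-1].isdigit()) for pron in prons]


def ssp_syl(word: str) -> int:
    return len(ssp.tokenize(word.lower()))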
class MultiSylT(object):

    def __init__(self, dictName='-', lang=None):
        # Sonority Sequencing Tokenizer defaults to 26 latin letters,
        # english pronunciation.
        self.SSP = SyllableTokenizer()
        self.changeLang(lang)
        self.dict = {"words": []}
        if (dictName == '-'):
            dictName = os.path.dirname(__file__) + "/dict.yaml"
        try:
            with open(dictName) as f:
                self.dict = yaml.safe_load(f)
        except BaseException:
            error("%s could not be loaded." % dictName)
        # CMU Pronunciation dictionary includes 119K+ english words plus some
        # proper nouns using the latin alphabet, occasionally with punctuation.
        self.d = cmudict.dict()

    def changeLang(self, lang):
        if lang not in pyphen.LANGUAGES:
            lang = 'en'
        self.pyphen = pyphen.Pyphen(lang=lang)
        self.lang = lang

    def multiTokenize(self, originalWord):
        """ Return options for tokenizing a word. """
        word = self.deformat(originalWord)
        tokenizations = []
        # If the word exists in our dictionary, include those tokenizations.
        if (word in self.dict['words']):
            tokenizations += self.dict['words'][word]
        # If the word exists in the dictionary (but as singular), include it.
        elif (word[-1] == 's' and word[0:-1] in self.dict['words']):
            for tk in self.dict['words'][word[0:-1]]:
                nt = tk.copy()
                nt[-1] = nt[-1] + 's'
                tokenizations.append(nt)
        # Otherwise, use an algorithm to get word split up into syllables
        tokenized = self.SSP.tokenize(word)
        splitter = "\t"
        hyphenated = self.pyphen.inserted(word, splitter).split(splitter)
        if self.lang == 'en':
            tokenizations = self._addMatchingSylCount(word, tokenizations,
                                                      tokenized, hyphenated)
        elif self.lang == 'es':
            # Sonority Sequencing doesn't work well with strong and weak vowels
            esTokenized = self._spanishTokenize(word)
            if esTokenized not in tokenizations:
                tokenizations.append(esTokenized)
            # Hunspell tokenizations are not as accurate as our tokenized ones:
            # only include them if the syllable count matches.
            if hyphenated not in tokenizations and len(hyphenated) == len(esTokenized):
                tokenizations.append(hyphenated)
        else:
            if tokenized not in tokenizations:
                tokenizations.append(tokenized)
            if hyphenated not in tokenizations:
                tokenizations.append(hyphenated)
        return list(
            map(self.reformat, tokenizations,
                [originalWord for x in range(0, len(tokenizations))]))

    def _addMatchingSylCount(self, word, tokenizations, tokenized, hyphenated):
        sylCounts = self.nsyl(word)
        # If the tokenized or hyphenated version has the same number of
        # syllables as one of the CMU STT pronunciations, but we don't
        # already have that syllable-count represented, include it.
        lh = len(hyphenated)
        if (lh in sylCounts and lh not in map(len, tokenizations)):
            tokenizations.append(hyphenated)
        lt = len(tokenized)
        if (lt in sylCounts and lt not in map(len, tokenizations)):
            tokenizations.append(tokenized)
        if (1 in sylCounts and [word] not in tokenizations):
            tokenizations.append([word])
        # Fallback if there are no tokenizations.
        if (len(tokenizations) == 0):
            warning("%s has %d syllables," % (str(hyphenated), len(hyphenated)) +
                    ' expected: ' + (" or ".join(map(str, sylCounts)) or "???"))
            tokenizations.append(hyphenated)
        return tokenizations

    def _spanishTokenize(self, word):
        """
        Make sure spanish hyphenated syllable counts are correct
        https://www.spanishdict.com/guide/spanish-syllables-and-syllabification-rules
        """
        # Accented vowels always get their own syllable.
        accentedVowels = "áéíóú"
        # Two strong vowels together are split into different syllables.
        strongVowels = "aeo"
        # Weak vowels can blend with each other, or with strong vowels.
        weakVowels = "iuü"
        vowels = accentedVowels + strongVowels + weakVowels
        # Split certain vowel pairs, and let SSP do the rest.
        newWord = ""
        prevLetter = " "
        for letter in word:
            if (letter in vowels and prevLetter in accentedVowels) or \
               (letter in accentedVowels and prevLetter in vowels) or \
               (letter in strongVowels and prevLetter in strongVowels):
                newWord += "-" + letter
            else:
                newWord += letter
            prevLetter = letter
        # TODO: Fix tokenization for double-r and double-l
        tokenized = self.SSP.tokenize(newWord)
        return list(filter(lambda syl: syl != '-', tokenized))

    def deformat(self, word):
        return re.sub("[" + wordSyl.smartSingles + "]", "'",
                      word.lower().strip(wordSyl.puncs))

    def reformat(self, oldTokenized, template):
        # Since tokenized is mutable, create a duplicate of it.
        tokenized = list(oldTokenized)
        # Match the case
        plainTemp = template.strip(wordSyl.puncs)
        if (plainTemp and plainTemp.isupper()):
            tokenized[0] = tokenized[0].upper()
        elif (plainTemp and plainTemp[0].isupper()):
            tokenized[0] = tokenized[0][0].upper() + tokenized[0][1:]
        elif (plainTemp and len(tokenized[0]) > 1 and len(plainTemp) > 1
              and plainTemp[1].isupper()):
            tokenized[0] = tokenized[0][0] + \
                tokenized[0][1].upper() + tokenized[0][2:]
        # Prepend/append the punctuations
        match = re.search(r"^[" + wordSyl.puncs + r"]+", template)
        starting = match.group(0) if match else ''
        match = re.search(r"[" + wordSyl.puncs + r"]+$", template)
        ending = match.group(0) if match else ''
        tokenized[0] = starting + tokenized[0]
        tokenized[-1] = tokenized[-1] + ending
        # Replace smart single-quotes
        dumbPlaceholder = "\n"
        splitter = "\t"
        templateNoDumb = template.replace("'", dumbPlaceholder)
        for letter in templateNoDumb:
            if letter in wordSyl.smartSingles + dumbPlaceholder:
                tokenized = splitter.join(tokenized).replace("'", letter, 1).split(splitter)
        tokenized = splitter.join(tokenized).replace(dumbPlaceholder, "'").split(splitter)
        return tokenized

    def nsyl(self, word):
        """Get the number of syllables a word should be from the CMU
        Pronunciation dictionary. Returned as a list to account for variants."""
        if word in self.d:
            return [
                len(list(y for y in x if y[-1].isdigit()))
                for x in self.d[word.lower()]
            ]
        return []
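# A usage sketch for MultiSylT, assuming its module-level dependencies
# (pyphen, yaml, cmudict, a wordSyl module with puncs/smartSingles, the
# error/warning helpers, and a dict.yaml next to the module) are available
# as in the original project.
mst = MultiSylT(lang='en')
print(mst.multiTokenize("Justification"))
# Candidate splits whose syllable counts agree with the CMU pronunciations,
# e.g. [['Jus', 'ti', 'fi', 'ca', 'tion'], ...] depending on dict.yaml.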