def test_long_words(self):
    """Check that long words are split into the expected syllables."""
    # (word, syllables hyphenate_word is expected to return)
    cases = [
        ('pneumoultramicroscópico',
         ['pneu', 'moul', 'tra', 'mi', 'cros', 'có', 'pi', 'co']),
        ('pneumoultramicroscopicossilicovulcanoconiose',
         ['pneu', 'moul', 'tra', 'mi', 'cros', 'co', 'pi', 'cos', 'si',
          'li', 'co', 'vul', 'ca', 'no', 'co', 'ni', 'o', 'se']),
        ('anticonstitucionalissimamente',
         ['an', 'ti', 'cons', 'ti', 'tu', 'ci', 'o', 'na', 'lis', 'si',
          'ma', 'men', 'te']),
        ('insignificância', ['in', 'sig', 'ni', 'fi', 'cân', 'ci', 'a']),
        ('pseudomembrana', ['pseu', 'do', 'mem', 'bra', 'na']),
    ]
    # Cases run in order and stop at the first mismatch.
    for word, syllables in cases:
        self.assertEqual(hyphenate_word(word), syllables)
def test_exceptions(self):
    """Check the words whose expected split ends in 'ware'."""
    cases = (
        ('software', ['soft', 'ware']),
        ('hardware', ['hard', 'ware']),
    )
    for word, syllables in cases:
        self.assertEqual(hyphenate_word(word), syllables)
async def get_auto_splits_associations(word, limit):
    """Find the two-way split of ``word`` whose associations grade highest.

    Args:
        word (str): Target word.
        limit (int): Associations limit forwarded to ``get_associations``.

    Returns:
        The ``get_associations`` result for the unsplit word when it
        hyphenates to a single syllable or is shorter than MIN_WORD_LENGTH.
        Otherwise the result of the best-graded split, or None when no
        split grades above zero (including when there is no split point).
    """
    # Short words are looked up whole, without trying any split.
    if len(hyphenate_word(word)) == 1 or len(word) < MIN_WORD_LENGTH:
        return await get_associations([word], limit)

    # One task per split point so all lookups run concurrently; both halves
    # are at least MIN_ASSOCIATION_LENGTH characters long.
    last_cut = len(word) - MIN_ASSOCIATION_LENGTH
    lookups = [
        asyncio.create_task(
            get_associations([word[:cut], word[cut:]], limit=limit))
        for cut in range(MIN_ASSOCIATION_LENGTH, last_cut + 1)
    ]
    candidates = await asyncio.gather(*lookups)

    # Keep the first candidate with the strictly highest positive grade.
    best_grade = 0
    best_pair = None
    for candidate in candidates:
        grade = calculate_splits_grade(*[item.grade for item in candidate])
        if grade > best_grade:
            best_grade, best_pair = grade, candidate
    return best_pair
def broize(word, ipa):
    """Return a bro version of ``word`` given its IPA pronunciation, or None.

    None is returned when the word has fewer than two syllables or when no
    syllable can be broized.
    """
    parts = [hyphenate_word(chunk) for chunk in word.split()]
    syllables = list(itertools.chain.from_iterable(parts))
    if len(syllables) < 2:
        return None

    if '.' in ipa:
        # IPA uses periods for syllable breaks (not every pronunciation has
        # them), so the number of periods before 'oʊ' is the index of the
        # syllable containing the phoneme to replace.
        target = ipa.count('.', 0, ipa.find('oʊ'))
        try:
            replacement = broize_syllable(syllables[target], target)
        except IndexError:
            # Hyphenation sometimes yields fewer syllables than the IPA
            # suggests; fall through to the brute-force search below.
            pass
        else:
            if replacement is None:
                return None
            return make_word(parts, replacement, target)

    # Last resort: try every syllable in order and use the first that works.
    for position, syllable in enumerate(syllables):
        replacement = broize_syllable(syllable, position)
        if replacement is not None:
            return make_word(parts, replacement, position)
    return None
def tokenize(f_read, f_write, supervised=False):
    """Write the words of ``f_read`` to ``f_write``, one per line, reversed.

    Blank lines and lines consisting only of digits are skipped. For every
    other line the end-of-line punctuation is removed, the words are written
    in reverse order (apostrophes stripped from both ends, lower-cased), and
    the line is terminated with a line containing only "@".

    Args:
        f_read: Path of the text file to read.
        f_write: Path of the output file (overwritten).
        supervised: When exactly ``True``, each word is preceded by the
            stress string from ``build_stress_string`` and a tab.
    """
    # Stressed, unstressed since we construct the lines in reverse.
    stresses = ['U', 'S']
    # Context managers close both files even if a helper raises. The output
    # is opened first, as before, so it is truncated before reading starts.
    with open(f_write, 'w') as f_out, open(f_read) as f_in:
        for line in f_in:
            stripped = line.strip()
            if stripped == '' or stripped.isdigit():
                continue
            # Stress parity restarts at the end of every line.
            curr_stress = 0
            for word in reversed(remove_end_line_punct(stripped).split()):
                if supervised is True:
                    res = hyphenate_word(word)
                    if '-' in res:
                        res.remove('-')
                    hy_length = len(res)
                    # The parity after this word is also its start stress.
                    curr_stress = (curr_stress + hy_length) % 2
                    f_out.write(
                        build_stress_string(stresses, curr_stress, hy_length)
                        + "\t")
                f_out.write(word.strip('\'').lower() + '\n')
            f_out.write("@\n")
def speak_system(self, system):
    """Speak a system name, hyphenated and/or NATO-spelled per the flags.

    With ``self.nato`` set the text is spoken at normal speed, otherwise at
    slow speed; the speed is reset to normal afterwards.
    """
    spoken = system
    # The flags are compared with ``== True`` on purpose: only True (or 1)
    # enables a transformation.
    if self.hyphen == True:
        spoken = " ".join(hyphenate_word(self._system_speakify(system)))
    if self.nato == True:
        # NATO spelling starts again from the raw name and replaces any
        # hyphenated form; only words shorter than nato_max_length are
        # spelled out. Pieces are concatenated without a separator.
        spoken = "".join(
            nato_spell(piece) if len(piece) < self.nato_max_length else piece
            for piece in system.split(" "))

    if self.nato:
        self.set_normal()
    else:
        self.set_slow()
    self.speak_now(spoken)
    self.set_normal()
def PodzielNaSylaby(wyraz, krotnosc):
    """Add ``krotnosc`` to the tally of every syllable of ``wyraz``.

    The tallies live in the module-level mapping ``d_sylaby``; a syllable
    not seen before starts at ``krotnosc``.
    """
    for sylaba in hyphenate_word(wyraz):
        if sylaba in d_sylaby:
            d_sylaby[sylaba] += krotnosc
        else:
            d_sylaby[sylaba] = krotnosc
def generate_emission(self, M, word_map, seed_word, supervised=False):
    '''
    Generates an emission of about M syllables that starts with seed_word.

    The starting state is drawn from the seed word's column of the
    observation matrix, normalised to sum to one.

    Arguments:
        M:          Number of syllables to generate. The seed word is always
                    emitted, even if it alone has more than M syllables.
        word_map:   Mapping from word to observation index.
        seed_word:  First word of the emission; must be a key of word_map.
        supervised: When exactly True, the state advances through self.A
                    each time a sampled word is rejected for being too long.

    Returns:
        emission:   The generated words as a list, seed word first.
    '''
    # Invert word_map once instead of scanning keys()/values() per sample.
    # Indexing those views only works on Python 2. setdefault keeps the
    # first word seen for an index, matching list.index().
    words_by_index = {}
    for known_word, index in word_map.items():
        words_by_index.setdefault(index, known_word)

    emission = []
    numeric_seed = word_map[seed_word]

    # Probability of each state given the seed word (in-place normalisation).
    seed_col = column(self.O, numeric_seed)
    seed_sum = sum(seed_col)
    for i in range(len(seed_col)):
        seed_col[i] /= seed_sum
    state = np.random.choice(np.arange(self.L), p=seed_col)

    emission.append(seed_word)
    M -= len(hyphenate_word(seed_word))

    while M > 0:
        obs = np.random.choice(np.arange(self.D), p=self.O[state])
        word = words_by_index[obs]
        # Resample until the word fits in the remaining syllable budget.
        while len(hyphenate_word(word)) > M:
            if supervised is True:
                state = np.random.choice(np.arange(self.L), p=self.A[state])
            obs = np.random.choice(np.arange(self.D), p=self.O[state])
            word = words_by_index[obs]
        emission.append(word)
        M -= len(hyphenate_word(word))
        state = np.random.choice(np.arange(self.L), p=self.A[state])

    return emission
def test_accent(self):
    """Check words containing accented or nasal vowels."""
    cases = [
        ('pão', ['pão']),
        ('coração', ['co', 'ra', 'ção']),
        ('acarajé', ['a', 'ca', 'ra', 'jé']),
        ('jacaré', ['ja', 'ca', 'ré']),
        ('classificação', ['clas', 'si', 'fi', 'ca', 'ção']),
    ]
    for word, syllables in cases:
        self.assertEqual(hyphenate_word(word), syllables)
def test_single_syllable(self):
    """Check that one-syllable words come back as a single-item list."""
    for word in ('trem', 'a', 'e', 'pneu', 'mel'):
        # The only expected syllable is the word itself.
        self.assertEqual(hyphenate_word(word), [word])
def get_word_with_most_syllables(lyrics):
    """Return the words of ``lyrics`` that have the highest syllable count.

    Args:
        lyrics: Iterable of words.

    Returns:
        List of all words tied for the maximum count, in input order;
        repeated words appear once per occurrence. Empty for empty input.
    """
    longest_words = []
    max_syllables = 0
    for word in lyrics:
        syllables = len(hyphenate.hyphenate_word(word))
        if syllables > max_syllables:
            # New record: restart the list with this word.
            max_syllables = syllables
            longest_words = [word]
        elif syllables == max_syllables:
            longest_words.append(word)
    return longest_words
def average_syllables_per_line(lyrics):
    """Return the mean number of syllables per line.

    Args:
        lyrics: Iterable of lines, each an iterable of words.

    Returns:
        Total syllables of each line averaged over the number of lines, or
        0.0 when ``lyrics`` has no lines (previously a ZeroDivisionError).
    """
    syllable_count_per_line = [
        sum(len(hyphenate.hyphenate_word(word)) for word in sentence)
        for sentence in lyrics
    ]
    if not syllable_count_per_line:
        return 0.0
    return sum(syllable_count_per_line) / len(syllable_count_per_line)
def get_average_syllable_count(frequency_tokenized_lyrics):
    """Return the mean syllable count per word as a two-decimal string.

    Args:
        frequency_tokenized_lyrics: Iterable of words.

    Returns:
        The average formatted with two decimals (e.g. "1.50"); "0.00" when
        there are no words (previously a ZeroDivisionError).
    """
    syllable_count_per_word = [
        len(hyphenate.hyphenate_word(word))
        for word in frequency_tokenized_lyrics
    ]
    if syllable_count_per_word:
        average = sum(syllable_count_per_word) / len(syllable_count_per_word)
    else:
        average = 0.0
    return "{0:0.2f}".format(average)
def __init__(self, text):
    """Split ``text`` into syllables, keeping long character runs together.

    The result is stored in ``self.syllables``.
    """
    # Spans where one character occurs three or more times in a row; they
    # are merged back into a single syllable at the end.
    repeated_spans = [(match.start(), match.end())
                      for match in re.finditer(r'(.)\1{2,}', text)]

    # Cut the text at every camelcase_ex match (camelCase or similar).
    pieces = []
    cursor = 0
    for boundary in self.camelcase_ex.finditer(text):
        pieces.append(text[cursor:boundary.start()])
        cursor = boundary.start()
    pieces.append(text[cursor:])

    # Hyphenate each piece and concatenate the results in order.
    syllables = []
    for piece in pieces:
        syllables = syllables + hyphenate_word(piece)
    self.syllables = syllables

    # Collapse every syllable touched by a repeated run into one syllable.
    for first_char, past_last_char in repeated_spans:
        begin = self._find_syllable(first_char)
        end = self._find_syllable(past_last_char - 1) + 1
        self.syllables[begin:end] = ["".join(self.syllables[begin:end])]
def hyphen(words, worddict):
    """Split out-of-vocabulary words into two known halves where possible.

    Args:
        words: Iterable of words.
        worddict: Mapping of known words; a key mapped to None counts as
            unknown for the halves.

    Returns:
        List in which each known word is kept as is; an unknown word is
        replaced by the (left, right) pair for every syllable boundary whose
        two halves are both known, or kept unchanged if there is none.
    """
    out = []
    for word in words:
        if word in worddict:
            out.append(word)
            continue
        toks = hyphenate_word(word)
        split = False
        # Try every syllable boundary as the split point.
        for i in range(1, len(toks)):
            s1 = concat(toks[:i])
            s2 = concat(toks[i:])
            if worddict.get(s1) is not None and worddict.get(s2) is not None:
                # NOTE(review): there is no break, so a word with several
                # valid boundaries contributes several pairs - confirm that
                # this is intended.
                out.append(s1)
                out.append(s2)
                split = True
        if not split:
            out.append(word)
    return out
def __init__(self, text):
    """Build ``self.syllables`` from ``text``.

    The text is cut at every ``camelcase_ex`` match, each piece is
    hyphenated, and syllables covering a run of three or more identical
    characters are merged into one.
    """
    # Character spans of the repeated runs, collapsed at the end.
    run_spans = [(found.start(), found.end())
                 for found in re.finditer(r'(.)\1{2,}', text)]

    # Consecutive cut positions delimit the camelCase pieces; the first
    # piece starts at 0 and the last one runs to the end of the text.
    cuts = [found.start() for found in self.camelcase_ex.finditer(text)]
    pieces = [text[lo:hi] for lo, hi in zip([0] + cuts, cuts + [len(text)])]

    collected = []
    for piece in pieces:
        collected = collected + hyphenate_word(piece)
    self.syllables = collected

    # Replace the syllables spanned by each run with their concatenation.
    for run_start, run_stop in run_spans:
        begin = self._find_syllable(run_start)
        end = self._find_syllable(run_stop - 1) + 1
        merged = ''.join(self.syllables[begin:end])
        self.syllables[begin:end] = [merged]
def _butt_word(word, butt_pass=0):
    """Replace one randomly chosen syllable of ``word`` with "butt".

    Args:
        word: Token to modify. RE_SPLIT_PUNCTUATION splits it into leading
            punctuation, the word itself and trailing punctuation.
        butt_pass: Recursion depth; 0 for the initial call.

    Returns:
        The modified token with its punctuation re-attached, or ``word``
        unchanged on a recursive pass when only one syllable is left.
    """
    lp, actual_word, rp = RE_SPLIT_PUNCTUATION.match(word).groups()
    hyphenated_parts = hyphenate_word(actual_word)
    if butt_pass > 0 and len(hyphenated_parts) == 1:
        return word

    # Character offsets of every syllable boundary in actual_word.
    points = [0]
    for part in hyphenated_parts:
        points.append(points[-1] + len(part))

    offset_index = random.randrange(len(points) - 1)
    start = points[offset_index]
    end = points[offset_index + 1]

    # Swallow the t's right after and the b's right before the syllable so
    # the result never contains "buttt" or "bbutt". Widening to the left
    # keeps `end` fixed: the previous (start, length) bookkeeping moved the
    # whole window left, pushing the consumed t's back out ("ab|bit" with
    # "bit" chosen became "abuttt").
    while actual_word[end:end + 1] == 't':
        end += 1
    while start > 0 and actual_word[start - 1] == 'b':
        start -= 1

    sub = actual_word[start:end]
    if not sub:
        # Empty selection: replace the whole word instead.
        sub = actual_word
        start, end = 0, len(sub)

    # Mirror the capitalisation of the text being replaced.
    butt = 'butt'
    if sub.isupper():
        butt = 'BUTT'
    elif sub[0].isupper():
        butt = 'Butt'
    actual_word = actual_word[:start] + butt + actual_word[end:]

    # Long words may get extra replacements, more likely on later passes.
    if len(hyphenated_parts) > 5 and random.randint(0, (4 - butt_pass)) == 1:
        butt_pass += 1
        actual_word = _butt_word(actual_word, butt_pass=butt_pass)
    return lp + actual_word + rp
def _butt_word(word, butt_pass=0):
    """Swap a random syllable of ``word`` for "butt", preserving punctuation.

    Args:
        word: Token to modify; RE_SPLIT_PUNCTUATION yields its left
            punctuation, core word and right punctuation.
        butt_pass: How many replacements were already made by recursion.

    Returns:
        The token with one (or, for long words, possibly more) syllables
        replaced. On a recursive pass a one-syllable word is returned as is.
    """
    lp, actual_word, rp = RE_SPLIT_PUNCTUATION.match(word).groups()
    hyphenated_parts = hyphenate_word(actual_word)
    if butt_pass > 0 and len(hyphenated_parts) == 1:
        return word

    # Running character offsets marking where each syllable begins and ends.
    boundaries = [0]
    for part in hyphenated_parts:
        boundaries.append(boundaries[-1] + len(part))

    chosen = random.randrange(len(boundaries) - 1)
    left, right = boundaries[chosen], boundaries[chosen + 1]

    # Extend the replaced span over trailing t's and leading b's to avoid
    # e.g. "Bartering" -> "Butttering". The right edge stays fixed while the
    # left edge moves: the old code kept a fixed length instead, so moving
    # left dropped the trailing t's again and produced a triple t.
    while actual_word[right:right + 1] == 't':
        right += 1
    while left > 0 and actual_word[left - 1] == 'b':
        left -= 1

    sub = actual_word[left:right]
    if not sub:
        # Nothing selected: fall back to replacing the whole word.
        sub = actual_word
        left, right = 0, len(sub)

    if sub.isupper():
        butt = 'BUTT'
    elif sub[0].isupper():
        butt = 'Butt'
    else:
        butt = 'butt'
    actual_word = actual_word[:left] + butt + actual_word[right:]

    # Words of more than five syllables may be processed again.
    if len(hyphenated_parts) > 5 and random.randint(0, (4 - butt_pass)) == 1:
        butt_pass += 1
        actual_word = _butt_word(actual_word, butt_pass=butt_pass)
    return lp + actual_word + rp
def create_syllable(lyrics):
    '''
    Strip punctuation and digits from the lyrics and pair every syllable
    with the word it came from (the model input format).

    Carriage returns, tabs and newlines are deleted rather than replaced by
    a space, so words separated only by them are joined. Syllables come from
    the 'hyphenate' package, whose hyphenation points are not always true
    syllable boundaries.

    Returns a list of [syllable, word] lists in text order.
    '''
    unwanted = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    words = unwanted.sub("", lyrics).split()
    return [[syllable, word]
            for word in words
            for syllable in hyphenate_word(word)]
def process_system_name(self, system):
    """Return the text to speak for a system name.

    With ``self.hyphen`` the name is run through ``_numbers_speakify`` and
    split into space-separated syllables. With ``self.nato`` the raw name is
    used instead: every word gets a leading space, and words shorter than
    ``self.nato_max_length`` are NATO-spelled.
    """
    spoken = system
    # ``== True`` is kept on purpose: only True (or 1) enables a step.
    if self.hyphen == True:
        spoken = " ".join(hyphenate_word(self._numbers_speakify(system)))
    if self.nato == True:
        # NATO spelling overrides any hyphenated form computed above.
        spoken = "".join(
            " " + (nato_spell(piece)
                   if len(piece) < self.nato_max_length else piece)
            for piece in system.split(" "))
    return spoken
def process_system_name(self, system):
    """Convert a system name into the string handed to the speech engine.

    ``self.hyphen`` selects syllable-by-syllable output of the
    number-expanded name; ``self.nato`` (which takes precedence) selects
    NATO spelling of every word shorter than ``self.nato_max_length``.
    """
    result = system
    # Flags are compared with ``== True`` as before, so only True (or 1)
    # turns a transformation on.
    if self.hyphen == True:
        expanded = self._numbers_speakify(system)
        result = " ".join(hyphenate_word(expanded))
    if self.nato == True:
        pieces = []
        for token in system.split(" "):
            if len(token) < self.nato_max_length:
                pieces.append(" " + nato_spell(token))
            else:
                pieces.append(" " + token)
        # Every piece carries its own leading space, so join without one.
        result = "".join(pieces)
    return result
def wordFromRule(word, rule):
    """Pick a random word from sowpods.txt that shares a piece with ``word``.

    Args:
        word: Source word; it is hyphenated to obtain the pieces.
        rule: Selects the match. 0, 1, 4, 5: the second piece anywhere in
            the candidate; 2, 3: the last piece at the start of the
            candidate; anything else: the last piece at its end.

    Returns:
        A random matching line of sowpods.txt, or 'elephant' when fewer than
        two lines match.
    """
    syls = hyphenate.hyphenate_word(word)  # split the word into syllables
    leng = len(syls)
    if leng <= 1:
        # Too few syllables to work with: use character snippets instead.
        # `leng` is deliberately left as is, so the "last piece" below is
        # syls[leng - 1] exactly as before.
        if len(word) >= 1:
            syls = [word[:1], word[1:2], word[-2:]]
        else:
            syls = [word, word, word]

    # Escape the piece so characters such as '.' or '?' match literally
    # instead of being read as regex syntax (or breaking compilation).
    if rule in (0, 1, 4, 5):
        pattern = re.escape(syls[1])
    elif rule in (2, 3):
        pattern = '^' + re.escape(syls[leng - 1])
    else:
        pattern = re.escape(syls[leng - 1]) + '$'
    thesearch = re.compile(pattern)

    # The dictionary is re-read on every call; `with` makes sure the handle
    # is closed, which the previous version never did.
    with open('sowpods.txt') as wordlist:
        thematches = [
            candidate
            for candidate in (line.strip() for line in wordlist)
            if thesearch.search(candidate)
        ]

    # NOTE(review): a single match is also discarded here (threshold <= 1);
    # confirm that is intended.
    if len(thematches) <= 1:
        thematches = ['elephant']
    return random.choice(thematches)
def get_phonemes(word, selection_criteria):
    """Get the phonetic representation of the syllables after the stress.

    :param word: String containing the word.
    :param selection_criteria: Function to filter the selected phonemes.
    :returns: Selected phonemes from the first one containing '1' onwards;
        an empty list when neither the word nor its last hyphenation
        component is in the pronunciation dictionary.
    :rtype: list
    """
    def _stressed_ending(key):
        # Phonemes from the first one marked '1' to the end, filtered.
        ending = dropwhile(lambda x: '1' not in x, pronounciation[key])
        return [p for p in ending if selection_criteria(p)]

    key = word.strip().lower()
    try:
        return _stressed_ending(key)
    except KeyError:
        pass

    # jacquelinekclee's edit: words not in the CMU dict may be compound
    # words. Thus, use the hyphenate package to divide such words into
    # components and try the phonemes of the last component. The normalised
    # key is hyphenated so the fallback gets the same strip/lower treatment
    # as the primary lookup; an empty hyphenation (IndexError) is handled
    # like a missing entry.
    try:
        return _stressed_ending(hyphenate_word(key)[-1])
    except (KeyError, IndexError):
        return []
m = sampa_re_oh.search(line) if m: #print i, title, m.group().replace('"', '"') #if type in ('Noun', 'Proper noun'): words.add(title) #if type == 'Verb': # print i, title #type = 'Unknown' # uncomment this block when trying new heuristics so that we don't have to scan the whole wiktionary # if i > 10000000: # break print len(words) output = [] for w in words: parts = hyphenate_word(w) if len(parts) > 1: try: # Try a bunch of heuristics to turn the word into something coherant after we've crammed a bro in there for i, p in enumerate(parts): if i == 0 and p[1] == 'o': parts[i] = 'BRO' + p[2:] raise WordUsed() if len(p) == 2 and p[1] == 'o': if i > 0 and p[0] in 'tgvdnl': parts[i] = p[0] + 'BRO' else: parts[i] = 'BRO' raise WordUsed() elif p.startswith('o'): parts[i] = 'BR' + p
line3 -= x line3_words.append(x) line1_final = [] line2_final = [] line3_final = [] #if(len(line1_prev) == 0): for numSyllables in line1_words: #get a word from wordsTxt which is the same number of syllables bySyllables = [] randWord = '' while (len(bySyllables) != numSyllables): #or (charToExclude in randWord): i = random.randrange(0, len(wordsList)) randWord = wordsList[i] bySyllables = hyphenate.hyphenate_word(randWord) if len(line1_final) == 0: randWord = randWord.capitalize() elif len(line1_final) == (len(line1_words) - 1): if( random.random() > 0.5): randWord = randWord + ',' line1_final.append(randWord) #if(len(line2_prev) == 0): for numSyllables in line2_words: #get a word from wordsTxt which is the same number of syllables bySyllables = [] randWord = '' while (len(bySyllables) != numSyllables): #or (charToExclude in randWord): i = random.randrange(0, len(wordsList)) randWord = wordsList[i]
def test_two_syllables(self):
    """Check short words, most of which split into two syllables."""
    # 'sódio' and 'ódio' are expected to yield three syllables.
    cases = [
        ('abrir', ['a', 'brir']),
        ('pote', ['po', 'te']),
        ('monte', ['mon', 'te']),
        ('chorei', ['cho', 'rei']),
        ('treta', ['tre', 'ta']),
        ('holmes', ['hol', 'mes']),
        ('ultra', ['ul', 'tra']),
        ('sódio', ['só', 'di', 'o']),
        ('tomé', ['to', 'mé']),
        ('dibre', ['di', 'bre']),
        ('trator', ['tra', 'tor']),
        ('tracto', ['trac', 'to']),
        ('tótem', ['tó', 'tem']),
        ('uber', ['u', 'ber']),
        ('chuva', ['chu', 'va']),
        ('ódio', ['ó', 'di', 'o']),
    ]
    for word, syllables in cases:
        self.assertEqual(hyphenate_word(word), syllables)
def syllabify_orth_with_hyphenate(token, num_sylls=None):
    """Hyphenate ``token``, optionally requiring an exact syllable count.

    Returns the syllable list, or an empty list when ``num_sylls`` is truthy
    and differs from the number of syllables found.
    """
    from hyphenate import hyphenate_word
    syllables = hyphenate_word(token)
    if num_sylls and len(syllables) != num_sylls:
        return []
    return syllables
content = [line.rstrip() for line in text] # Split up multidimensional array to a single dimensional array with newlines. content = [ "X" if word is not "\n" else "\n" for line in content for word in line.split(' ') + ['\n'] ] # Remove all special characters from a single dimensional array, yield newlines. content = [ ''.join(e for e in word.lower() if e.isalpha() or e == '\n') for word in content ] # split up syllables and add a space before every word. content = [syllable for word in content for syllable in hyphenate_word(word)] print('corpus length:', len(content)) # Create a character set. chars = set() [chars.add(word) for word in content] print('total chars:', len(chars)) char_indices = dict((c, i) for i, c in enumerate(chars)) indices_char = dict((i, c) for i, c in enumerate(chars)) # Remove characters that are not necessary pickle.dump(char_indices, open("char_indic.json", "wb")) pickle.dump(indices_char, open("indic_char.json", "wb"))
def test_medium_words(self):
    """Check medium-length words covering hiatus, digraph and cluster cases."""
    cases = [
        ('abracadabra', ['a', 'bra', 'ca', 'da', 'bra']),
        ('sílaba', ['sí', 'la', 'ba']),
        ('momento', ['mo', 'men', 'to']),
        ('tormento', ['tor', 'men', 'to']),
        ('polenta', ['po', 'len', 'ta']),
        ('ratoeira', ['ra', 'to', 'ei', 'ra']),
        ('beira', ['bei', 'ra']),
        ('cachoeira', ['ca', 'cho', 'ei', 'ra']),
        ('arritmia', ['ar', 'rit', 'mi', 'a']),
        ('chocante', ['cho', 'can', 'te']),
        ('tempestivo', ['tem', 'pes', 'ti', 'vo']),
        ('inexequível', ['i', 'ne', 'xe', 'quí', 'vel']),
        ('carrinho', ['car', 'ri', 'nho']),
        ('adstringir', ['ads', 'trin', 'gir']),
        ('mimosear', ['mi', 'mo', 'se', 'ar']),
        ('cesariana', ['ce', 'sa', 'ri', 'a', 'na']),
        ('abobalhado', ['a', 'bo', 'ba', 'lha', 'do']),
        ('solstício', ['sols', 'tí', 'ci', 'o']),
        ('ilustração', ['i', 'lus', 'tra', 'ção']),
        ('hinduísmo', ['hin', 'du', 'ís', 'mo']),
        ('mãoaberta', ['mão', 'a', 'ber', 'ta']),
        ('múon', ['mú', 'on']),
        ('iodeto', ['i', 'o', 'de', 'to']),
        ('biógrafo', ['bi', 'ó', 'gra', 'fo']),
        ('execração', ['e', 'xe', 'cra', 'ção']),
        ('aeródromo', ['a', 'e', 'ró', 'dro', 'mo']),
    ]
    # Cases run in order and stop at the first mismatch.
    for word, syllables in cases:
        self.assertEqual(hyphenate_word(word), syllables)
line3_words.append(x) line1_final = [] line2_final = [] line3_final = [] #if(len(line1_prev) == 0): for numSyllables in line1_words: #get a word from wordsTxt which is the same number of syllables bySyllables = [] randWord = '' while (len(bySyllables) != numSyllables): #or (charToExclude in randWord): i = random.randrange(0, len(wordsList)) randWord = wordsList[i] bySyllables = hyphenate.hyphenate_word(randWord) if len(line1_final) == 0: randWord = randWord.capitalize() elif len(line1_final) == (len(line1_words) - 1): if (random.random() > 0.5): randWord = randWord + ',' line1_final.append(randWord) #if(len(line2_prev) == 0): for numSyllables in line2_words: #get a word from wordsTxt which is the same number of syllables bySyllables = [] randWord = '' while (len(bySyllables) != numSyllables): #or (charToExclude in randWord): i = random.randrange(0, len(wordsList))
def betweensyllables(word):
    """Return ``word`` with a soft hyphen (U+00AD) between its syllables."""
    soft_hyphen = "\u00AD"
    syllables = hyphenate.hyphenate_word(word)
    return soft_hyphen.join(syllables)
def process_data(spark, input_data, output_data, mode):
    """
    Ingest the Catalog and process all the files included in it, computing
    reading statistics and storing them in parquet files.
    Currently handles only english titles.

    Args:
        spark: Active SparkSession.
        input_data: Base location of the input; it is concatenated directly
            with "catalog/..." and with each book location.
        output_data: Base location for the "catalog" and
            "reading_difficulty" parquet outputs (overwritten).
        mode: 'local' reads catalog_mini.csv, anything else catalog.csv.
    """
    catalog_file = "catalog_mini.csv" if mode == 'local' else "catalog.csv"
    catalog_data_url = input_data + "catalog/" + catalog_file

    df = spark.read.csv(catalog_data_url, sep='\t', header=True)
    # printSchema() prints by itself and returns None, so it is not wrapped
    # in print() (which used to emit an extra "None").
    df.printSchema()

    catalog_table = (
        df.select(["title", "author", "language", "id", "_url"])
        .filter("language = 'en'")
        .withColumnRenamed("title", "book_title")
        .withColumnRenamed("_url", "location")
        .withColumnRenamed("id", "book_id")
        .distinct()
    )
    catalog_table.write.parquet(
        output_data + "catalog",
        mode='overwrite',
        # partitionBy=["author"],
    )

    # Collect the list of files. This can be large ~20k
    # The first six characters of each location are dropped before it is
    # appended to input_data.
    book_url_list = [
        input_data + str(x.location)[6:]
        for x in catalog_table.select("location").collect()
    ]

    # Read every book as a single row (wholetext).
    files_df = spark.read.text(book_url_list, wholetext=True)
    print("partition size", files_df.rdd.getNumPartitions())

    # Create a book_id column based on filename
    BOOK_IF_UDF = udf(lambda x: x.split("/")[-1].split(".txt")[0],
                      StringType())
    files_df = files_df.withColumn("id", BOOK_IF_UDF(input_file_name()))

    # Each Row in the files_df will represent 1 full book length text.
    # In the following, we split each row's data to find the number of
    # sentences, the number of words and the number of syllables, in order
    # to compute the grade level and reading ease scores.

    # Regex to detect a sentence, from
    # https://stackoverflow.com/questions/25735644/
    # python-regex-for-splitting-text-into-sentences-sentence-tokenizing
    # Raw string: same pattern, without invalid-escape warnings.
    SENTENCE_REGEX = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    SYLLABLES_UDF = udf(lambda x: len(hyphenate_word(x)), IntegerType())

    # Generate list of sentences, words and syllables for each book.
    sentences = files_df.select(
        "id", split("value", SENTENCE_REGEX).alias("sentences"))
    sentence_count = sentences.select(
        "id", size("sentences").alias("sentence_count"))

    words_df = files_df.select("id", split("value", ' ').alias("words"))
    word_count = words_df.select("id", size("words").alias("word_count"))

    syllables_count = (
        words_df.select("id", explode("words").alias("syllables"))
        .select("id", SYLLABLES_UDF("syllables").alias("syllables"))
        .groupby("id")
        .agg(f_sum("syllables").alias("syllables_count"))
    )

    # compute all scores
    results = sentence_count.join(word_count, "id", "inner") \
        .join(syllables_count, "id", "inner")

    words_per_sentence = col("word_count") / col("sentence_count")
    syllables_per_word = col("syllables_count") / col("word_count")

    # Flesch-Kincaid grade level.
    results = results.withColumn(
        "grade_level",
        0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59)
    # Flesch reading ease. The words-per-sentence term carries a 1.015
    # coefficient, which was previously missing.
    results = results.withColumn(
        "reading_ease",
        206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word)

    results.write.parquet(
        path=output_data + "reading_difficulty",
        mode="overwrite",
    )
def syllabify_orth_with_hyphenate(token, num_sylls=None):
    """Return the hyphenation of ``token`` as produced by ``hyphenate_word``.

    ``num_sylls`` is accepted for interface compatibility but is not used:
    the hyphenation is returned whatever its length.
    """
    from hyphenate import hyphenate_word
    # The unreachable trailing ``return l`` (undefined name) was removed.
    return hyphenate_word(token)
texts = text_input.split(" \n \n ") splitted_texts = [] progressbar = ProgressBar(len(texts)) for text in texts: splitted_text = "POEMSTART " text = text_to_word_sequence(text, filters=text_filter, lower=True, split=" ") for word in text: if word == "\n": word = "LINEEND" splitted_text += " " + word continue for syllable in hyphenate_word(word): splitted_text += " " + syllable splitted_text += " POEMEND" splitted_texts.append(splitted_text) progressbar.count() print("") # Create an initial tokenizer text_tokenizer = Tokenizer(filters=text_filter, lower=True, split=" ", char_level=False) text_tokenizer.fit_on_texts(splitted_texts) # Generate a list of words that occur more than n times # Generate a list of words that occur less than n times
def syll_count(w):
    """Return the number of pieces ``hyphenate_word`` splits ``w`` into."""
    pieces = hyphenate_word(w)
    return len(pieces)
def hyphenate(self):
    """Hyphenate ``self.full_tok`` and store the result in ``self.hyphenated``."""
    pieces = hyphenate.hyphenate_word(self.full_tok)
    self.hyphenated = pieces
async def on_message(message): if client.user.id != message.author.id and not message.author.bot: with open('users.json', 'r+') as userfile: users = json.loads(userfile.read()) if not message.author.id in users.keys(): users[str(message.author.id)] = {"xp": 0, "level": 1} #Add xp to a user's file based off of message length and a modifier users[str(message.author.id)]["xp"] = int(users[str( message.author.id)]["xp"]) + math.floor( len(message.content) / 8) + 10 if int(users[str(message.author.id)]["xp"]) > int(users[str( message.author.id)]["level"]) * 1000: print(message.author.name + " just leveled up!") users[str(message.author.id)]["level"] = int(users[str( message.author.id)]["level"]) + 1 users[str(message.author.id)]["xp"] = 0 embed = discord.Embed( title="Level Up!", description=str(message.author.display_name) + " is now level " + str(users[str(message.author.id)]["level"]) + "!", color=0xbc42f4) if message.author.avatar_url: embed.set_thumbnail(url=message.author.avatar_url) await client.send_message(message.channel, embed=embed) userfile.seek(0) userfile.truncate( 0 ) #erases file before dumping the new json. 
Shouldn't have to do this but we're here arn't we json.dump(users, userfile, indent=4) if not os.path.isfile( os.path.join('settings', str(message.server.id + '.json'))): shutil.copy2( 'settings.json', os.path.join('settings', str(message.server.id + '.json'))) print("created " + str( os.path.join('settings', str(message.server.id + '.json')))) with open(os.path.join('settings', str(message.server.id + '.json')), 'r') as serversettings: settings = json.loads(serversettings.read()) prefix = str(settings["bot"]["prefix"]) global txtout #.test# if await checkCommand(settings, "test", message): print(':robot:') await client.send_message(message.channel, ':robot:') #.info# elif await checkCommand(settings, "info", message): print('BOT INFO') embed = discord.Embed( title="BOT INFO", description= "Made by @ShiftyWizard#4823 & @Arboreal#4200 for fun.", url="https://github.com/leaharboreal/bot", color=0x1abc9c) embed.set_thumbnail( url= "https://raw.githubusercontent.com/leaharboreal/bot/master/profilepic.png" ) embed.set_footer( text="© bot | 2018", icon_url= "https://raw.githubusercontent.com/leaharboreal/bot/master/profilepic.png" ) await client.send_message(message.channel, embed=embed) elif message.content.lower().startswith( prefix + settings["commands"]["settings"]["command"] ) and message.author.server_permissions.manage_server: #upgrade the file's read permissions to rw# serversettings.close() with open( os.path.join('settings', str(message.server.id + '.json')), 'r+') as serversettings: #convert the current server's settings.json file into a python object# settings = json.loads(serversettings.read()) #set changed to false so if no setting is modified the file will be unchanged# changed = False #check if number of args in message is high enough else reply with syntax error# if len(message.content.lower().split(" ")) > 1: #check if the first argument is commands# if message.content.lower().split(" ")[1] == "commands": #check if number of args in message is high 
enough for this branch# if len(message.content.lower().split(" ")) > 2: #ensure the command exists and is not the info or settings command# if (message.content.lower().split(" ")[2] in settings["commands"] ) and not (message.content.lower().split( " ")[2] in ["info", "settings"]): #check if number of args in message is high enough for this branch# if len(message.content.lower().split(" ") ) > 3 and len(message.content.lower( ).split(" ")) < 6: #check if the user wants to change the command alias# if message.content.lower().split( " ")[3] == "command": #ensure the command contains only safe characters and is smaller or equal to 16 characters# if re.match( r"^[\w\d~!@#$%^&+=;:,./?\*\-]{1,16}$", message.content.lower(). split(" ")[4]): #set the selected command to the alias# settings["commands"][ message.content.lower( ).split(" ")[2]][ "command"] = message.content.lower( ).split(" ")[4] #set changed to true, ensuring the file is saved# changed = True #set output message to confirmation# txtout = message.content.lower( ).split( " " )[2] + " has been set to " + message.content.lower( ).split(" ")[4] #reject command alias# else: txtout = "Could not set command. Commands can only 1-16 characters long and contain letters, numbers and these symbols: `~!@#$%^&+=;:,./?*-`" #check if the user wants to change if command is enabled# elif message.content.lower().split( " ")[3] == "enabled": #check if argument is true# if message.content.lower().split( " ")[4] == "true": #set the commands enabled value to true# settings["commands"][ message.content.lower( ).split(" ") [2]]["enabled"] = True #set changed to true, ensuring the file is saved# changed = True #set output message to confirmation# txtout = message.content.lower( ).split(" ")[ 2] + " is now `enabled`." 
#check if answer is false if not true# elif message.content.lower().split( " ")[4] == "false": #set the commands enabled value to false# settings["commands"][str( message.content.lower( ).split(" ") [2])]["enabled"] = False #set changed to true, ensuring the file is saved# changed = True #set output message to confirmation# txtout = message.content.lower( ).split(" ")[ 2] + " is now `disabled`." #reject value as it is not true or false# else: txtout = "This value can only be set to `true` or `false`." #unrecognised argument# else: txtout = "Incorrect syntax(E:2). `" + prefix + "settings commands " + message.content.split( " " )[2] + " <command|enabled> <value>`" #not enough args# else: txtout = "Incorrect syntax (E:1). `" + prefix + "settings commands " + message.content.split( " " )[2] + " <command|enabled> <value>`" #unrecognised/locked argument# else: txtout = "Command `" + message.content.split( " " )[2] + "` not found or cannot be modified. Check the github page command list which can be accessed with `" + prefix + "info`" #not enough args# else: txtout = "Incorrect syntax. `" + prefix + "settings commands <commandname> <command|enabled> <value>`" #check if the first argument is bot, if it isn't commands# elif message.content.lower().split(" ")[1] == "bot": #check if number of args in message is high enough for this branch# if len(message.content.lower().split(" ")) > 2: #check if arg is equal to prefix# if message.content.lower().split( " ")[2] == "prefix": #check if number of args in message is high enough for this branch# if len(message.content.lower().split( " ")) > 3: #check if user's value matches rule# if re.match( r"^[\w\d~!@#$%^&+=;:,./?\*\-]{1,4}$", message.content.split(" ")[3]): #set bot prefix to the user value# settings["bot"][ "prefix"] = message.content.lower( ).split(" ")[3] #set changed to true, ensuring the file is saved# changed = True #set output message to confirmation# txtout = "Prefix set. 
Bot will respond to commands with the prefix `" + message.content.lower( ).split( " " )[3] + "`. To access settings, use the new prefix." #reject prefix as it does not conform to the character requirements# else: txtout = "Could not set prefix. Prefixes can only 1-4 characters long and contain letters, numbers and these symbols: `~!@#$%^&+=;:,./?*-`" #not enough args# else: txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`" #incorrect arg# else: txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`" #not enough args# else: txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`" #the first argument is incorrect, so respond with a syntax error# else: txtout = "Incorrect syntax. `" + prefix + "settings <commands|bot>`" #not enough args# else: txtout = "Incorrect syntax. `" + prefix + "settings <commands|bot>`" #if the file has been changed at any point, save changes# if changed: serversettings.seek( 0) # reset file position to the beginning. json.dump(settings, serversettings, indent=4) serversettings.truncate() #log output in console then send as discord message# print(txtout) await client.send_message(message.channel, txtout) #.purge elif await checkCommand( settings, "purge", message) and message.server.me.permissions_in( message.channel).manage_messages: if message.author.server_permissions.manage_messages and message.author.server_permissions.manage_server: if len(message.content.lower().split(" ")) == 2: limit = int(message.content.lower().split(" ")[1]) if limit <= 100: if await client.purge_from(message.channel, limit=limit): await client.send_message( message.channel, ":warning: Deleted " + str(limit) + " messages.") else: await client.send_message(message.channel, "100 messages max!") else: await client.send_message( message.channel, "Please specify the number of messages!") else: await client.send_message( message.channel, "You need at least manage messages and manage server permissions to do that!" 
) #.level# elif await checkCommand(settings, "level", message): txtout = "Level " + str(users[str( message.author.id)]["level"]) + "\n" + str(users[str( message.author.id)]["xp"]) + " xp" embed = discord.Embed(title=str(message.author.display_name), description=txtout, color=0x42b3f4) if message.author.avatar_url: embed.set_thumbnail(url=message.author.avatar_url) await client.send_message(message.channel, embed=embed) #.addquote# elif await checkCommand(settings, "addquote", message): #check if the quotes file exists for the server, if not, create a file with an empty json object# if not os.path.isfile( os.path.join('quotes', str(message.server.id + '.json'))): with open( os.path.join('quotes', str(message.server.id + '.json')), 'a') as f: f.write("{\n}") #open the server's quote file# with open( os.path.join('quotes', str(message.server.id + '.json')), 'r+') as f: quotes = json.loads( f.read()) #initialize json file as python object# #set quotemessage to the message object before the user's command# quotemessage = await getQuote(message) quote = base64.b64encode( str(quotemessage.content).encode('utf-8')).decode( 'utf-8') #ensure quote does not contain any illegal symbols# if quotemessage.author.id in quotes: #if the user already has a quote object then append quote# quoteid = int( max(quotes[quotemessage.author.id].keys())) + 1 quotes[str(quotemessage.author.id)][quoteid] = quote else: #if they don't have a quote object, create one with their 1st quote# quoteid = 1 quotes[quotemessage.author.id] = {} quotes[quotemessage.author.id][quoteid] = quote print("Added Quote to file " + message.server.id + ".json: " + str(quotemessage.content)) #add log of changes# await client.send_message( message.channel, ":white_check_mark: Added quote: `" + str(quotemessage.content) + "`" ) #confirm addition of quote# #seek to start of file before dumping the new json object in the file# f.seek(0) json.dump(quotes, f, indent=4) f.close() #close the file since we are done with it# 
#.quote# elif await checkCommand(settings, "quote", message): if os.path.isfile( os.path.join('quotes', str(message.server.id + '.json'))): with open( os.path.join('quotes', str(message.server.id + '.json')), 'r') as f: quotes = json.loads(f.read()) if message.mentions: if message.mentions[0].id in quotes.keys(): quoteauthor = message.mentions[0].id txtout = "```" + base64.b64decode( str(quotes[quoteauthor][random.choice( list(quotes[quoteauthor].keys()))]) ).decode('utf-8') + "```" + message.mentions[ 0].mention else: txtout = "Oops! " + message.mentions[ 0].mention + " hasn't been quoted on this server yet.\nUse `" + prefix + "addquote` when they say something great." else: quoteauthor = await client.get_user_info( random.choice(list(quotes.keys()))) txtout = "```" + base64.b64decode( str(quotes[quoteauthor.id][random.choice( list(quotes[quoteauthor.id].keys()))]) ).decode('utf-8') + "```" + quoteauthor.mention else: txtout = "Oops! No quotes available for this server!\nUse `" + prefix + "addquote` to add quotes." print(txtout) await client.send_message(message.channel, txtout) #@SOMEONE# elif await checkCommand(settings, "@someone", message, atStart=False): x = message.server.members members = [] for member in x: if member.permissions_in( message.channel ).read_messages and member.permissions_in( message.channel).send_messages: members.append(str(member.id)) someone = random.choice(members) print( str(someone) + " was mentioned with @someone by " + str(message.author.id)) txtout = "<@" + someone + ">" + " was randomly mentioned with @someone!" 
await client.send_message(message.channel, txtout) #Colour elif await checkCommand(settings, "colour", message, atStart=False): colour = message.content.lower().split(" ")[1] if re.match(r"(^#[\d,a-f]{6}$|^#[\d,a-f]{3}$)", colour): colourType = "hex" colour = colour[1:] elif re.match( r"rgb\((25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2})\)", colour): colourType = "rgb" elif re.match( r"hsl\((25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(100|\d{1,2})%,(100|\d{1,2})%\)", colour): colourType = "hsl" elif re.match( r"cmyk\((100|\d{1,2}),(100|\d{1,2}),(100|\d{1,2}),(100|\d{1,2})\)", colour): colourType = "cmyk" else: colourType = None if colourType: print("http://thecolorapi.com/id?" + colourType + "=" + colour) with urllib.request.urlopen("http://thecolorapi.com/id?" + colourType + "=" + colour) as url: data = json.loads(url.read().decode()) colourHEX = data["hex"]["value"] colourRGB = data["rgb"]["value"] colourHSL = data["hsl"]["value"] colourCMYK = data["cmyk"]["value"] embed = discord.Embed(title=data["name"]["value"], colour=discord.Colour( int(data["hex"]["value"][1:], 16))) embed.add_field(name="Hex", value=colourHEX, inline=True) embed.add_field(name="RGB", value=colourRGB, inline=True) embed.add_field(name="HSL", value=colourHSL, inline=True) embed.add_field(name="CMYK", value=colourCMYK, inline=True) #embed.set_thumbnail(url=data["image"]["bare"]) embed.set_footer(text="Sourced using thecolorapi") await client.send_message(message.channel, embed=embed) else: await client.send_message(message.channel, "Format not recognised :(") #RETURN STRING WITH SPACES EVERY CHARACTER# elif await checkCommand(settings, "widespace", message): txtout = "" x = "" for char in message.content[ len(prefix + settings["commands"]["widespace"]["command"]):]: if ord(char) in range(33, 127): x = chr(ord(char) + 0xFEE0) else: x = char txtout = txtout + str(x) + " " print(txtout) await client.send_message(message.channel, txtout) 
#VERBOSE MESSAGE GENERATOR# elif await checkCommand(settings, "verbose", message): txtin = message.content[ len(prefix + settings["commands"]["verbose"]["command"]):] txtout = "" synonyms = [] for word in txtin.split(): for syn in wordnet.synsets(word): for l in syn.lemmas(): synonyms.append(l.name()) if not synonyms: txtout += (" " + word) else: txtout += (" " + max(set(synonyms), key=len)) synonyms = [] txtout = txtout.replace("-", " ") txtout = txtout.replace("_", " ") print(txtout) await client.send_message(message.channel, txtout) #SUCCINCT MESSAGE GENERATOR# elif await checkCommand(settings, "succinct", message): txtin = message.content[ len(prefix + settings["commands"]["succinct"]["command"]):] txtout = "" synonyms = [] for word in txtin.split(): for syn in wordnet.synsets(word): for l in syn.lemmas(): synonyms.append(l.name()) if not synonyms: txtout += (" " + word) else: txtout += (" " + min(set(synonyms), key=len)) synonyms = [] txtout = txtout.replace("-", " ") txtout = txtout.replace("_", " ") print(txtout) await client.send_message(message.channel, txtout) #SMUSH TWO WORDS TOGETHER# elif await checkCommand(settings, "smush", message): worda = message.content.split(" ")[1] wordb = message.content.split(" ")[2] asyl = hyphenate_word(worda) bsyl = hyphenate_word(wordb) if len(bsyl) > 1: txtout = str(asyl[0]) + "".join(bsyl[1:]) else: #in case word 2 is only 1 syllable, just join the words txtout = str(asyl[0]) + "".join(bsyl[0]) txtout = worda + " " + wordb + " (" + txtout + ")" print(txtout) await client.send_message(message.channel, txtout) elif await checkCommand(settings, "dog", message): with urllib.request.urlopen( "https://dog.ceo/api/breeds/image/random") as url: data = json.loads(url.read().decode()) embed = discord.Embed(color=0xeee657) embed.set_image(url=data["message"]) print(data["message"]) await client.send_message(message.channel, embed=embed) elif await checkCommand(settings, "catfact", message): with urllib.request.urlopen( 
"https://cat-fact.herokuapp.com/facts/random") as url: data = json.loads(url.read().decode()) txtout = data["text"] print(txtout) await client.send_message(message.channel, txtout) #CHOOSE FROM USER SPECIFIED LIST# elif await checkCommand(settings, "choose", message): items = message.content[ len(prefix + settings["commands"]["choose"]["command"]):] txtout = random.choice(items.split("|")) print(txtout) await client.send_message(message.channel, txtout) #"RATE" SOMETHING BY PICKING A NUMBER FROM 1 TO 10# elif await checkCommand(settings, "rate", message): txtout = "I\'d rate " + str(message.content[len( prefix + settings["commands"]["rate"]["command"]):]) + " **" + str( random.randrange(10)) + " out of 10!**" print(txtout) await client.send_message(message.channel, txtout) #FLIP A COIN# elif await checkCommand(settings, "flip", message): embed = discord.Embed(title="Flip", description=random.choice( ['Heads', 'Tails']), color=0xeee657) await client.send_message(message.channel, embed=embed) #PRINT EMOTES# elif await checkCommand(settings, "emotes", message): txtout = "" with open('emotes.txt', 'r') as file: lines = list(file) for x in range(int(message.content.lower().split(" ")[1])): txtout += str(random.choice(lines)).rstrip() print(message.content.lower().split(" ")[1] + " emojis") await client.send_message(message.channel, txtout) #PERFORM XKCD37# elif settings["commands"]["xkcd37"][ "enabled"] == True and re.search(xkcd37, message.content): txtout = re.sub(xkcd37, xkcd37sub, message.content, 0) txtout = "```> " + txtout + "``` xkcd 37" print(txtout) await client.send_message(message.channel, txtout) #DADBOT# elif settings["commands"]["dadbot"]["enabled"] == True and ( message.content.lower().split(" ")[0] == 'i\'m' or message.content.lower().split(" ")[0] == 'im'): txtout = "Hi " + " ".join( message.content.split(" ")[1:]) + ", I'm Dad" print(txtout) await client.send_message(message.channel, txtout) #REPLY TO COMMENTS ABOUT BOT# elif 'bot' in 
message.content.lower().split( " ") and settings["commands"]["bot"]["enabled"] == True: txtin = sid.polarity_scores(message.content) if float(txtin['compound']) >= 0.2: txtout = ':heart:' elif float(txtin['compound']) <= 0.15 and float( txtin['compound']) >= -0.15: txtout = 'I am unfeeling' else: txtout = 'no u' print(txtout + ' ' + str(txtin['compound'])) await client.send_message(message.channel, txtout) elif 'oopsie' in message.content.lower().split( " ") and settings["commands"]["oopsie"]["enabled"] == True: txtout = "OOPSIE WOOPSIE!! Uwu We make a fucky wucky!! A wittle fucko boingo! The code monkeys at our headquarters are working VEWY HAWD to fix this!" print(txtout) await client.send_message(message.channel, txtout) #GARBAGE MEME# elif settings["commands"]["peestream"][ "command"] in message.content.lower( ) and settings["commands"]["peestream"]["enabled"] == True: embed = discord.Embed(color=0xeee657) embed.set_image( url= "https://cdn.discordapp.com/attachments/260061122193784833/404628539728723969/chrome_2018-01-07_20-25-17.jpg" ) print("Pee Stream") await client.send_message(message.channel, embed=embed) txtout = ""
def hyphenate_word(word):
    """Count the pieces that ``hyphenate.hyphenate_word`` yields for *word*.

    Despite its name, this adapter does not hand back the pieces
    themselves; it only reports how many there are.

    Args:
        word: Passed through unchanged to ``hyphenate.hyphenate_word``.

    Returns:
        int: ``len()`` of the value returned by
        ``hyphenate.hyphenate_word(word)``.
    """
    pieces = hyphenate.hyphenate_word(word)
    return len(pieces)
audioisvalid = False print("Sample", audio_name, "not found in recombine loop", audio_pathlist, "was pathlist", en_word, "is", oz_word) export_word() # make sure audio segment is empty for next word combined_audio = AudioSegment.empty() phonemes = str(phonemes).replace('[', '').replace(']', '').replace( "'", '').replace('"', '').replace(',', '') # update the lexicon dataframe if the word is not already known if not df4['oz_word'].eq(oz_word).any(): df4.loc[len(df4.index)] = [oz_word, phonemes] else: # print(en_word, " - no translation found in dataframe") en_hyphens = hyphenate_word(en_word) trl_word = en_word trl_hyphens = en_hyphens temp_hyphens = [] for hyphen in en_hyphens: for key in forbidden_letters.keys(): hyphen = hyphen.replace(key, forbidden_letters[key]) temp_hyphens.append(hyphen) for key in forbidden_letters.keys(): trl_word = trl_word.replace(key, forbidden_letters[key]) # print(trl_word) hyphens = temp_hyphens if len(en_hyphens) == 1: word_len = len(str(en_word)) hyphens_len = math.sqrt(len(trl_word)) hyphens_rounded = int(hyphens_len)