Example #1
0
    def test_long_words(self):
        """Check the syllable split of several very long words."""
        # Each entry pairs an input word with the exact syllable list that
        # hyphenate_word is expected to return for it.
        cases = [
            ('pneumoultramicroscópico',
             ['pneu', 'moul', 'tra', 'mi', 'cros', 'có', 'pi', 'co']),
            ('pneumoultramicroscopicossilicovulcanoconiose',
             ['pneu', 'moul', 'tra', 'mi', 'cros', 'co', 'pi', 'cos', 'si',
              'li', 'co', 'vul', 'ca', 'no', 'co', 'ni', 'o', 'se']),
            ('anticonstitucionalissimamente',
             ['an', 'ti', 'cons', 'ti', 'tu', 'ci', 'o', 'na', 'lis', 'si',
              'ma', 'men', 'te']),
            ('insignificância',
             ['in', 'sig', 'ni', 'fi', 'cân', 'ci', 'a']),
            ('pseudomembrana',
             ['pseu', 'do', 'mem', 'bra', 'na']),
        ]
        for word, expected in cases:
            self.assertEqual(hyphenate_word(word), expected)
Example #2
0
    def test_exceptions(self):
        """Check words whose expected split is a special case."""
        cases = (
            ('software', ['soft', 'ware']),
            ('hardware', ['hard', 'ware']),
        )
        for word, expected in cases:
            self.assertEqual(hyphenate_word(word), expected)
Example #3
0
async def get_auto_splits_associations(word, limit):
    """Find the two-part split of ``word`` with the best association grade.

    Args:
        word (str): Target word.
        limit (int): Associations limit forwarded to ``get_associations``.

    Returns:
        For words that hyphenate to a single syllable, or that are shorter
        than ``MIN_WORD_LENGTH``, the result of ``get_associations`` on the
        whole word. Otherwise the ``get_associations`` result of the split
        whose grade (per ``calculate_splits_grade``) is highest, or ``None``
        when no split scores above 0.
    """
    # Short words are looked up as a whole instead of being split.
    if len(hyphenate_word(word)) == 1 or len(word) < MIN_WORD_LENGTH:
        return await get_associations([word], limit)

    # Launch one lookup per split point so they all run concurrently; both
    # halves keep at least MIN_ASSOCIATION_LENGTH characters.
    split_points = range(MIN_ASSOCIATION_LENGTH,
                         len(word) - MIN_ASSOCIATION_LENGTH + 1)
    pending = [
        asyncio.create_task(
            get_associations([word[:cut], word[cut:]], limit=limit))
        for cut in split_points
    ]
    candidates = await asyncio.gather(*pending)

    # Keep the first candidate that strictly beats every earlier grade.
    best_grade = 0
    best_pair = None
    for candidate in candidates:
        grade = calculate_splits_grade(*[item.grade for item in candidate])
        if grade > best_grade:
            best_grade, best_pair = grade, candidate
    return best_pair
def broize(word, ipa):
    """Return a "bro" version of ``word`` built from its IPA pronunciation.

    The word is hyphenated per whitespace-separated token. Returns ``None``
    when there are fewer than two syllables overall or when no syllable can
    be broized.
    """
    parts = [hyphenate_word(token) for token in word.split()]
    syllables = [syllable for group in parts for syllable in group]
    if len(syllables) < 2:
        return None

    # IPA marks syllable breaks with periods (not every pronunciation has
    # them). Counting the periods before the 'oʊ' phoneme gives the index of
    # the syllable that holds it.
    if '.' in ipa:
        target = ipa.count('.', 0, ipa.find('oʊ'))
        try:
            replacement = broize_syllable(syllables[target], target)
        except IndexError:
            # The hyphenation can be shorter than the IPA suggests; fall
            # through to the brute-force search below.
            pass
        else:
            return (None if replacement is None
                    else make_word(parts, replacement, target))

    # Last resort: take the first syllable that can be broized.
    for index, syllable in enumerate(syllables):
        replacement = broize_syllable(syllable, index)
        if replacement is not None:
            return make_word(parts, replacement, index)
    return None
Example #5
0
def tokenize(f_read, f_write, supervised=False):
    """Write the words of ``f_read`` to ``f_write``, one per line, reversed.

    Blank lines and lines consisting only of digits are skipped. Every other
    line is stripped, passed through ``remove_end_line_punct`` and split on
    whitespace; its words are written in reverse order, lower-cased and with
    surrounding apostrophes removed. A line containing only ``@`` is written
    after each processed input line.

    Args:
        f_read: Path of the text file to read.
        f_write: Path of the output file (overwritten).
        supervised: When exactly ``True``, each word is prefixed with the
            string from ``build_stress_string`` followed by a tab. The stress
            position alternates with the running syllable count and is reset
            at the start of every line.
    """
    if supervised is True:
        # Stressed, unstressed, since we construct the lines in reverse.
        stresses = ['U', 'S']
    curr_stress = 0

    # Context managers guarantee both files are closed, even on error. The
    # output is opened first, matching the original open order.
    with open(f_write, 'w') as f_out, open(f_read) as f_in:
        for line in f_in:
            stripped = line.strip()
            if stripped == '' or stripped.isdigit():
                continue

            line_strip = remove_end_line_punct(stripped)

            for word in reversed(line_strip.split()):
                if supervised is True:
                    res = hyphenate_word(word)
                    if '-' in res:
                        res.remove('-')

                    hy_length = len(res)
                    # Syllable-count parity decides which stress starts the
                    # word; it also carries over to the next word.
                    curr_stress = (curr_stress + hy_length) % 2
                    f_out.write(
                        build_stress_string(stresses, curr_stress, hy_length)
                        + "\t")

                f_out.write(word.strip('\'').lower() + '\n')
            f_out.write("@\n")
            curr_stress = 0
Example #6
0
    def speak_system(self, system):
        """Speak a system name, optionally hyphenated or NATO-spelled.

        With ``self.hyphen`` set, the name is passed through
        ``_system_speakify`` and split into space-separated syllables. With
        ``self.nato`` set, that result is discarded and every space-separated
        token of the original name shorter than ``self.nato_max_length`` is
        replaced by ``nato_spell(token)``. NATO output is spoken at normal
        speed, anything else slowly; speed is reset to normal afterwards.
        """
        text = system

        # The flags are compared with == True (not plain truthiness), exactly
        # as before.
        if self.hyphen==True:
            text = self._system_speakify(system)
            text = " ".join(hyphenate_word(text))

        if self.nato==True:
            # Pieces are concatenated without a separator.
            text = "".join(
                nato_spell(token) if len(token) < self.nato_max_length
                else token
                for token in system.split(" "))

        if self.nato:
            self.set_normal()
        else:
            self.set_slow()

        self.speak_now(text)
        self.set_normal()
Example #7
0
def PodzielNaSylaby(wyraz, krotnosc):
    """Split ``wyraz`` into syllables and add ``krotnosc`` to each one's tally.

    The tallies live in the module-level mapping ``d_sylaby``; a syllable
    seen for the first time starts at ``krotnosc``.
    """
    for sylaba in hyphenate_word(wyraz):
        if sylaba in d_sylaby:
            d_sylaby[sylaba] += krotnosc
        else:
            d_sylaby[sylaba] = krotnosc
Example #8
0
File: HMM.py Project: joshc/sonnet
    def generate_emission(self, M, word_map, seed_word, supervised=False):
        '''
        Generates a list of words totalling M syllables, starting with
        seed_word. The starting state is drawn from the seed word's column of
        the observation matrix self.O, normalised to sum to 1.

        Arguments:
            M:          Number of syllables to generate (syllables are
                        counted with hyphenate_word).
            word_map:   Mapping from word to its observation index.
            seed_word:  Word placed first in the emission; must be a key of
                        word_map.
            supervised: When exactly True, a rejected word (one with more
                        syllables than remain) also triggers a state
                        transition before the next draw.

        Returns:
            emission:   The randomly generated emission as a list of words.
        '''
        emission = []
        numeric_seed = word_map[seed_word]

        seed_col = column(self.O, numeric_seed)
        seed_sum = sum(seed_col)

        # Normalise the seed column so it can be used as a distribution.
        for i in range(len(seed_col)):
            seed_col[i] /= seed_sum

        state = np.random.choice(np.arange(self.L), p=seed_col)
        emission.append(seed_word)
        M -= len(hyphenate_word(seed_word))

        # Build the reverse lookup once. Materialising the views keeps this
        # working on Python 3, where dict.keys()/values() are not indexable
        # and have no .index(). The first matching index is used, as before.
        words = list(word_map.keys())
        observations = list(word_map.values())

        while M > 0:
            obs = np.random.choice(np.arange(self.D), p=self.O[state])
            word = words[observations.index(obs)]

            # Redraw until the word fits in the remaining syllable budget.
            while len(hyphenate_word(word)) > M:
                if supervised is True:
                    state = np.random.choice(np.arange(self.L),
                                             p=self.A[state])

                obs = np.random.choice(np.arange(self.D), p=self.O[state])
                word = words[observations.index(obs)]

            emission.append(word)
            M -= len(hyphenate_word(word))

            state = np.random.choice(np.arange(self.L), p=self.A[state])

        return emission
Example #9
0
    def test_accent(self):
        """Check the syllable split of words containing accented letters."""
        cases = [
            ('pão', ['pão']),
            ('coração', ['co', 'ra', 'ção']),
            ('acarajé', ['a', 'ca', 'ra', 'jé']),
            ('jacaré', ['ja', 'ca', 'ré']),
            ('classificação', ['clas', 'si', 'fi', 'ca', 'ção']),
        ]
        for word, expected in cases:
            self.assertEqual(hyphenate_word(word), expected)
Example #10
0
    def test_single_syllable(self):
        """Check that one-syllable words come back as a single-item list."""
        for word in ('trem', 'a', 'e', 'pneu', 'mel'):
            computed = hyphenate_word(word)
            self.assertEqual(computed, [word])
Example #11
0
def get_word_with_most_syllables(lyrics):
    """Return every word in ``lyrics`` tied for the highest syllable count.

    Args:
        lyrics: Iterable of words.

    Returns:
        The words with the highest syllable count, in input order (repeats
        are kept). Empty when ``lyrics`` is empty.
    """
    max_syllable_words = []
    max_syllables = 0
    for word in lyrics:
        # Hyphenate once per word; the count is all that is needed.
        syllable_count = len(hyphenate.hyphenate_word(word))
        if syllable_count > max_syllables:
            # New record: restart the list of winners.
            max_syllables = syllable_count
            max_syllable_words = [word]
        elif syllable_count == max_syllables:
            max_syllable_words.append(word)

    return max_syllable_words
Example #12
0
def average_syllables_per_line(lyrics):
    """Return the mean number of syllables per line.

    Args:
        lyrics: Iterable of lines, each an iterable of words.

    Returns:
        Total syllables of each line averaged over the number of lines, or
        0 when ``lyrics`` contains no lines (instead of dividing by zero).
    """
    syllable_count_per_line = [
        sum(len(hyphenate.hyphenate_word(word)) for word in sentence)
        for sentence in lyrics
    ]
    if not syllable_count_per_line:
        return 0

    return sum(syllable_count_per_line) / len(syllable_count_per_line)
Example #13
0
def get_average_syllable_count(frequency_tokenized_lyrics):
    """Return the mean syllables per word, formatted with two decimals.

    Args:
        frequency_tokenized_lyrics: Iterable of words.

    Returns:
        The average as a string such as ``"1.50"``; ``"0.00"`` when there
        are no words (instead of dividing by zero).
    """
    syllable_count_per_word = [
        len(hyphenate.hyphenate_word(word))
        for word in frequency_tokenized_lyrics
    ]
    if not syllable_count_per_word:
        return "0.00"

    average_syllable_count = sum(syllable_count_per_word) / len(
        syllable_count_per_word)
    return "{0:0.2f}".format(average_syllable_count)
Example #14
0
    def __init__(self, text):
        """Split ``text`` into syllables and store them in ``self.syllables``.

        The text is first cut at every ``camelcase_ex`` match, each piece is
        hyphenated, and finally any run of three or more identical
        characters is merged back into a single syllable.
        """
        # Character spans (start, end) of each run of 3+ repeated characters.
        runs = [(found.start(), found.end())
                for found in re.finditer(r'(.)\1{2,}', text)]

        # Cut the text at the start of every camelCase-style boundary.
        chunks = []
        cursor = 0
        for boundary in self.camelcase_ex.finditer(text):
            chunks.append(text[cursor:boundary.start()])
            cursor = boundary.start()
        chunks.append(text[cursor:])

        # Concatenate the syllables of every chunk, in order.
        syllables = []
        for chunk in chunks:
            syllables = syllables + hyphenate_word(chunk)
        self.syllables = syllables

        # Merge the syllables covered by each run into one.
        for run_start, run_end in runs:
            begin = self._find_syllable(run_start)
            end = self._find_syllable(run_end - 1) + 1
            self.syllables[begin:end] = ["".join(self.syllables[begin:end])]
Example #15
0
def hyphen(words, worddict):
    """Replace unknown words by two known halves where possible.

    Args:
        words: Iterable of words.
        worddict: Mapping of known words. A word is known when it is a key;
            a candidate half is known when its value is not ``None``.

    Returns:
        A new list. Known words are copied unchanged. An unknown word is
        hyphenated and replaced by the first pair ``(head, tail)`` of
        concatenated syllables whose halves are both known; if no split
        qualifies, the word itself is kept.
    """
    out = []
    for word in words:
        if word in worddict:
            out.append(word)
            continue

        toks = hyphenate_word(word)
        for i in range(1, len(toks)):
            s1 = concat(toks[:i])
            s2 = concat(toks[i:])
            if (worddict.get(s1) is not None
                    and worddict.get(s2) is not None):
                out.extend((s1, s2))
                # Stop at the first valid split; without this, a word with
                # several valid split points was emitted once per split.
                break
        else:
            # No split point produced two known halves.
            out.append(word)
    return out
Example #16
0
    def __init__(self, text):
        """Build ``self.syllables`` from ``text``.

        Pieces of ``text`` delimited by ``camelcase_ex`` matches are
        hyphenated separately; syllables that fall inside a run of three or
        more identical characters are then joined into one syllable.
        """
        # Remember where long runs of one repeated character sit in the text.
        repeated_runs = [(hit.start(), hit.end())
                         for hit in re.finditer(r'(.)\1{2,}', text)]

        # Chunk edges: the text start plus the start of every camelCase
        # match. The final chunk runs to the end of the text (slice to None).
        edges = [0] + [hit.start() for hit in self.camelcase_ex.finditer(text)]
        chunks = [text[lo:hi] for lo, hi in zip(edges, edges[1:] + [None])]

        self.syllables = [
            syllable for chunk in chunks for syllable in hyphenate_word(chunk)
        ]

        # Collapse the syllables spanned by each repeated-character run.
        for first_char, past_last_char in repeated_runs:
            begin = self._find_syllable(first_char)
            end = self._find_syllable(past_last_char - 1) + 1
            merged = ''.join(self.syllables[begin:end])
            self.syllables[begin:end] = [merged]
Example #17
0
def _butt_word(word, butt_pass=0):
    """Replace one randomly chosen syllable of ``word`` with "butt".

    ``butt_pass`` counts the recursive passes already made: for words with
    more than five syllables the function may call itself again on its own
    output to replace another syllable. Returns the modified word with its
    surrounding punctuation re-attached.
    """
    # Split into left punctuation, word, right punctuation (done on every
    # pass, including recursive ones).
    lp, actual_word, rp = RE_SPLIT_PUNCTUATION.match(word).groups()

    hyphenated_parts = hyphenate_word(actual_word)
    # On a recursive pass, leave single-syllable words untouched.
    if butt_pass > 0 and len(hyphenated_parts) == 1:
        return word

    x = 0
    points = [0]
    # Generate 'word' string offsets for splicing: points[i] is the start
    # offset of syllable i, and the last entry is the total length.
    for part in hyphenated_parts:
        x += len(part)
        points.append(x)

    # Pick the syllable to replace.
    offset_index = random.randrange(len(points) - 1)

    # l is the start offset of the chosen syllable, r is its length.
    l = points[offset_index]
    r = points[offset_index + 1] - l
    # Scan left and right to consume all leading b's and trailing t's to avoid e.g.
    # !butt Bartering -> Butttering # triple t
    while (actual_word[l + r:l + r + 1]) == 't':
        r += 1
    while l > 0 and actual_word[l - 1] == 'b':
        l -= 1
    sub = actual_word[l:l + r]
    butt = 'butt'
    # An empty selection means the whole word is replaced instead.
    if not len(sub):
        sub = actual_word
        l = 0
        r = len(sub)
    # Mirror the capitalisation of the text being replaced.
    if sub.isupper():
        butt = 'BUTT'
    elif sub[0].isupper():
        butt = 'Butt'

    actual_word = actual_word[:l] + butt + actual_word[l + r:]
    # Long words (more than five syllables) get a random chance of another
    # replacement; the randint range shrinks as butt_pass grows.
    if len(hyphenated_parts) > 5 and random.randint(0, (4 - butt_pass)) == 1:
        butt_pass += 1
        actual_word = _butt_word(actual_word, butt_pass=butt_pass)
    return lp + actual_word + rp
Example #18
0
def _butt_word(word, butt_pass=0):
	"""Replace one randomly chosen syllable of ``word`` with "butt".

	``butt_pass`` counts the recursive passes already made: for words with
	more than five syllables the function may call itself again on its own
	output to replace another syllable. Returns the modified word with its
	surrounding punctuation re-attached.
	"""
	# Split into left punctuation, word, right punctuation (done on every
	# pass, including recursive ones).
	lp, actual_word, rp = RE_SPLIT_PUNCTUATION.match(word).groups()

	hyphenated_parts = hyphenate_word(actual_word)
	# On a recursive pass, leave single-syllable words untouched.
	if butt_pass > 0 and len(hyphenated_parts) == 1:
		return word

	x = 0
	points = [0]
	# Generate 'word' string offsets for splicing: points[i] is the start
	# offset of syllable i, and the last entry is the total length.
	for part in hyphenated_parts:
		x += len(part)
		points.append(x)

	# Pick the syllable to replace.
	offset_index = random.randrange(len(points) - 1)

	# l is the start offset of the chosen syllable, r is its length.
	l = points[offset_index]
	r = points[offset_index + 1] - l
	# Scan left and right to consume all leading b's and trailing t's to avoid e.g.
	# !butt Bartering -> Butttering # triple t
	while (actual_word[l + r: l + r + 1]) == 't':
		r += 1
	while l > 0 and actual_word[l - 1] == 'b':
		l -= 1
	sub = actual_word[l:l+r]
	butt = 'butt'
	# An empty selection means the whole word is replaced instead.
	if not len(sub):
		sub = actual_word
		l = 0
		r = len(sub)
	# Mirror the capitalisation of the text being replaced.
	if sub.isupper():
		butt = 'BUTT'
	elif sub[0].isupper():
		butt = 'Butt'

	actual_word = actual_word[:l] + butt + actual_word[l+r:]
	# Long words (more than five syllables) get a random chance of another
	# replacement; the randint range shrinks as butt_pass grows.
	if len(hyphenated_parts) > 5 and random.randint(0, (4 - butt_pass)) == 1:
		butt_pass += 1
		actual_word = _butt_word(actual_word, butt_pass=butt_pass)
	return lp + actual_word + rp
Example #19
0
def create_syllable(lyrics):
    '''
    Originally written by Yunzheng. Strips punctuation, digits, carriage
    returns, tabs and newlines from the given text (they are deleted, not
    replaced by spaces), splits what is left on whitespace, and returns one
    [syllable, word] pair per syllable as the model input format.
    The split is not perfect: it relies on the 'hyphenate' package, which
    finds hyphenation points rather than true syllables.
    '''
    unwanted = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    words = unwanted.sub("", lyrics).split()

    # Pair every syllable with the word it came from, keeping input order.
    return [
        [syllable, word]
        for word in words
        for syllable in hyphenate_word(word)
    ]
Example #20
0
    def process_system_name(self, system):
        """Return the text to speak for a system name.

        With ``self.hyphen`` set, the name is passed through
        ``_numbers_speakify`` and split into space-separated syllables. With
        ``self.nato`` set, that result is discarded: every space-separated
        token of the original name is prefixed with a space, and tokens
        shorter than ``self.nato_max_length`` are replaced by
        ``nato_spell(token)``.
        """
        spoken = system

        # The flags are compared with == True (not plain truthiness), exactly
        # as before.
        if self.hyphen == True:
            spoken = " ".join(hyphenate_word(self._numbers_speakify(system)))

        if self.nato == True:
            pieces = []
            for token in system.split(" "):
                if len(token) < self.nato_max_length:
                    pieces.append(" " + nato_spell(token))
                else:
                    pieces.append(" " + token)
            spoken = "".join(pieces)

        return spoken
Example #21
0
    def process_system_name(self, system):
        """Convert a system name into its spoken form.

        Hyphen mode (``self.hyphen``) runs the name through
        ``_numbers_speakify`` and joins its syllables with spaces. NATO mode
        (``self.nato``) takes precedence: working from the original name,
        each space-separated token gets a leading space, and tokens shorter
        than ``self.nato_max_length`` are spelled with ``nato_spell``.
        """
        result = system

        # Flags are tested with ==True, as in the original, rather than by
        # truthiness.
        if self.hyphen==True:
            speakable = self._numbers_speakify(system)
            result = " ".join(hyphenate_word(speakable))

        if self.nato==True:
            result = "".join(
                " " + (nato_spell(token)
                       if len(token) < self.nato_max_length else token)
                for token in system.split(" "))

        return result
def wordFromRule(word, rule):
    """Pick a random dictionary word that shares a syllable with ``word``.

    ``word`` is hyphenated and one syllable, chosen by ``rule``, is used as
    a regular expression over the lines of ``sowpods.txt``:

    * rule in 0, 1, 4, 5: the second syllable, anywhere in the line;
    * rule in 2, 3: the last syllable, at the start of the line;
    * any other rule: the last syllable, at the end of the line.

    Returns a random matching line, or ``'elephant'`` when at most one line
    matched.
    """
    syls = hyphenate.hyphenate_word(word)  # split the word into syllables
    leng = len(syls)
    if leng <= 1:
        # Too few syllables: fall back to short snippets of the word.
        # NOTE(review): leng is not updated here, so syls[leng-1] below picks
        # the FIRST snippet when leng == 1 and the last when leng == 0.
        # Confirm whether the last snippet was intended in both cases.
        if len(word) >= 1:
            syls = [word[:1], word[1:2], word[-2:]]
        else:
            syls = [word, word, word]

    # Choose the appropriate search. The syllable is interpolated into the
    # pattern unescaped, as before.
    if rule in [0, 1, 4, 5]:
        thesearch = re.compile(r'%s' % syls[1])
    elif rule in [2, 3]:
        thesearch = re.compile(r'^%s' % syls[leng - 1])
    else:
        thesearch = re.compile(r'%s$' % syls[leng - 1])

    # The word list is re-read on every call; the context manager makes sure
    # the handle is closed again.
    with open('sowpods.txt') as wordlist:
        thematches = [
            liner for liner in (line.strip() for line in wordlist)
            if thesearch.search(liner)
        ]

    if len(thematches) <= 1:
        # Zero or one match: use the fixed fallback word instead.
        thematches = ['elephant']
    return random.choice(thematches)
def get_phonemes(word, selection_criteria):
    """Get the phonetic representation of the syllables after the stress.

    The word is stripped and lower-cased before it is looked up in
    ``pronounciation``. Words missing there may be compound words
    (jacquelinekclee's edit), so as a fallback the word is hyphenated and
    the last component is looked up instead.

    :param word: String containing the word.
    :param selection_criteria: Function to filter the selected phonemes.
    :returns: The phonemes from the first one containing ``'1'`` onwards
        that pass ``selection_criteria``; an empty list when neither the
        word nor its last component is known.
    :rtype: list
    """
    key = word.strip().lower()
    try:
        phones = pronounciation[key]
    except KeyError:
        # Hyphenate the normalised key, not the raw word, so the fallback
        # gets the same stripping and lower-casing as the first lookup.
        components = hyphenate_word(key)
        try:
            phones = pronounciation[components[-1]]
        except (KeyError, IndexError):
            # Unknown component, or the hyphenation came back empty.
            return []

    # Skip everything before the first phoneme that contains '1'.
    ending = dropwhile(lambda x: '1' not in x, phones)
    return [p for p in ending if selection_criteria(p)]
        m = sampa_re_oh.search(line)
        if m:
            #print i, title, m.group().replace('&quot;', '"')
            #if type in ('Noun', 'Proper noun'):
            words.add(title)
            #if type == 'Verb':
            #    print i, title
            #type = 'Unknown'
            # uncomment this block when trying new heuristics so that we don't have to scan the whole wiktionary
            # if i > 10000000:
            #     break

print len(words)
output = []
for w in words:
    parts = hyphenate_word(w)
    if len(parts) > 1:
        try:
            # Try a bunch of heuristics to turn the word into something coherant after we've crammed a bro in there
            for i, p in enumerate(parts):
                if i == 0 and p[1] == 'o':
                    parts[i] = 'BRO' + p[2:]
                    raise WordUsed()
                if len(p) == 2 and p[1] == 'o':
                    if i > 0 and p[0] in 'tgvdnl':
                        parts[i] = p[0] + 'BRO'
                    else:
                        parts[i] = 'BRO'
                    raise WordUsed()
                elif p.startswith('o'):
                    parts[i] = 'BR' + p
     line3 -= x
     line3_words.append(x)
 
 line1_final = []
 line2_final = []
 line3_final = []
 
 #if(len(line1_prev) == 0):
 for numSyllables in line1_words:
     #get a word from wordsTxt which is the same number of syllables
     bySyllables = []
     randWord = ''
     while (len(bySyllables) != numSyllables): #or (charToExclude in randWord):
             i = random.randrange(0, len(wordsList))
             randWord = wordsList[i]
             bySyllables = hyphenate.hyphenate_word(randWord)
     if len(line1_final) == 0:
         randWord = randWord.capitalize()
     elif len(line1_final) == (len(line1_words) - 1):
         if( random.random() > 0.5):
             randWord = randWord + ','
     line1_final.append(randWord)
     
 #if(len(line2_prev) == 0):
 for numSyllables in line2_words:
     #get a word from wordsTxt which is the same number of syllables
     bySyllables = []
     randWord = ''
     while (len(bySyllables) != numSyllables): #or (charToExclude in randWord):
             i = random.randrange(0, len(wordsList))
             randWord = wordsList[i]
Example #26
0
    def test_two_syllables(self):
        """Check the syllable split of short (mostly two-syllable) words."""
        cases = [
            ('abrir', ['a', 'brir']),
            ('pote', ['po', 'te']),
            ('monte', ['mon', 'te']),
            ('chorei', ['cho', 'rei']),
            ('treta', ['tre', 'ta']),
            ('holmes', ['hol', 'mes']),
            ('ultra', ['ul', 'tra']),
            ('sódio', ['só', 'di', 'o']),
            ('tomé', ['to', 'mé']),
            ('dibre', ['di', 'bre']),
            ('trator', ['tra', 'tor']),
            ('tracto', ['trac', 'to']),
            ('tótem', ['tó', 'tem']),
            ('uber', ['u', 'ber']),
            ('chuva', ['chu', 'va']),
            ('ódio', ['ó', 'di', 'o']),
        ]
        for word, expected in cases:
            self.assertEqual(hyphenate_word(word), expected)
Example #27
0
def syllabify_orth_with_hyphenate(token,num_sylls=None):
    """Hyphenate ``token``, optionally requiring a given syllable count.

    Returns the syllable list from ``hyphenate_word``. When ``num_sylls``
    is truthy and the list has a different length, returns ``[]`` instead.
    """
    # Imported lazily so the dependency is only needed when this is called.
    from hyphenate import hyphenate_word
    syllables = hyphenate_word(token)
    if num_sylls and len(syllables) != num_sylls:
        return []
    return syllables
Example #28
0
    content = [line.rstrip() for line in text]

# Flatten the list of lines into one list of word slots, with a "\n" marker
# appended after the words of each line.
# NOTE(review): every non-newline word is replaced by the literal "X" here,
# so the original words are discarded. Confirm this is intended.
content = [
    "X" if word != "\n" else "\n" for line in content
    for word in line.split(' ') + ['\n']
]

# Lower-case every entry and keep only alphabetic characters and newlines.
content = [
    ''.join(e for e in word.lower() if e.isalpha() or e == '\n')
    for word in content
]

# Split every entry into its syllables, flattening the result.
content = [syllable for word in content for syllable in hyphenate_word(word)]

print('corpus length:', len(content))

# Collect the set of distinct syllables (the model's "characters").
chars = set(content)

print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Persist both lookup tables. The data is pickled despite the .json file
# names; the context managers make sure the files are flushed and closed.
with open("char_indic.json", "wb") as char_indices_file:
    pickle.dump(char_indices, char_indices_file)
with open("indic_char.json", "wb") as indices_char_file:
    pickle.dump(indices_char, indices_char_file)
Example #29
0
    def test_medium_words(self):
        """Check the syllable split of medium-length words."""
        # Each entry pairs an input word with the exact syllable list that
        # hyphenate_word is expected to return for it.
        cases = [
            ('abracadabra', ['a', 'bra', 'ca', 'da', 'bra']),
            ('sílaba', ['sí', 'la', 'ba']),
            ('momento', ['mo', 'men', 'to']),
            ('tormento', ['tor', 'men', 'to']),
            ('polenta', ['po', 'len', 'ta']),
            ('ratoeira', ['ra', 'to', 'ei', 'ra']),
            ('beira', ['bei', 'ra']),
            ('cachoeira', ['ca', 'cho', 'ei', 'ra']),
            ('arritmia', ['ar', 'rit', 'mi', 'a']),
            ('chocante', ['cho', 'can', 'te']),
            ('tempestivo', ['tem', 'pes', 'ti', 'vo']),
            ('inexequível', ['i', 'ne', 'xe', 'quí', 'vel']),
            ('carrinho', ['car', 'ri', 'nho']),
            ('adstringir', ['ads', 'trin', 'gir']),
            ('mimosear', ['mi', 'mo', 'se', 'ar']),
            ('cesariana', ['ce', 'sa', 'ri', 'a', 'na']),
            ('abobalhado', ['a', 'bo', 'ba', 'lha', 'do']),
            ('solstício', ['sols', 'tí', 'ci', 'o']),
            ('ilustração', ['i', 'lus', 'tra', 'ção']),
            ('hinduísmo', ['hin', 'du', 'ís', 'mo']),
            ('mãoaberta', ['mão', 'a', 'ber', 'ta']),
            ('múon', ['mú', 'on']),
            ('iodeto', ['i', 'o', 'de', 'to']),
            ('biógrafo', ['bi', 'ó', 'gra', 'fo']),
            ('execração', ['e', 'xe', 'cra', 'ção']),
            ('aeródromo', ['a', 'e', 'ró', 'dro', 'mo']),
        ]
        for word, expected in cases:
            self.assertEqual(hyphenate_word(word), expected)
Example #30
0
        line3_words.append(x)

    line1_final = []
    line2_final = []
    line3_final = []

    #if(len(line1_prev) == 0):
    for numSyllables in line1_words:
        #get a word from wordsTxt which is the same number of syllables
        bySyllables = []
        randWord = ''
        while (len(bySyllables) !=
               numSyllables):  #or (charToExclude in randWord):
            i = random.randrange(0, len(wordsList))
            randWord = wordsList[i]
            bySyllables = hyphenate.hyphenate_word(randWord)
        if len(line1_final) == 0:
            randWord = randWord.capitalize()
        elif len(line1_final) == (len(line1_words) - 1):
            if (random.random() > 0.5):
                randWord = randWord + ','
        line1_final.append(randWord)

    #if(len(line2_prev) == 0):
    for numSyllables in line2_words:
        #get a word from wordsTxt which is the same number of syllables
        bySyllables = []
        randWord = ''
        while (len(bySyllables) !=
               numSyllables):  #or (charToExclude in randWord):
            i = random.randrange(0, len(wordsList))
Example #31
0
def betweensyllables(word):
    """Return ``word`` with U+00AD inserted between its syllables."""
    separator = "\u00AD"
    syllables = hyphenate.hyphenate_word(word)
    return separator.join(syllables)
def process_data(spark, input_data, output_data, mode):
    """Ingest the catalog, then compute and store reading statistics.

    The catalog CSV is read from ``input_data``, restricted to English
    titles, and written to ``output_data + "catalog"`` as parquet. Every
    book text referenced by the catalog is then loaded and its sentence,
    word and syllable counts are combined into a grade-level score and a
    reading-ease score, written to ``output_data + "reading_difficulty"``.

    Currently handles only English titles.

    Args:
        spark: Object exposing ``read.csv`` / ``read.text`` (presumably a
            ``SparkSession`` -- confirm against the caller).
        input_data: Path/URL prefix; catalog and book locations are appended
            to it by plain string concatenation.
        output_data: Path/URL prefix under which the parquet outputs are
            written (existing data is overwritten).
        mode: ``'local'`` selects ``catalog/catalog_mini.csv``; any other
            value selects ``catalog/catalog.csv``.
    """
    if mode == 'local':
        catalog_data_url = input_data + "catalog/catalog_mini.csv"
    else:
        catalog_data_url = input_data + "catalog/catalog.csv"

    df = spark.read.csv(catalog_data_url, sep='\t', header=True)

    # printSchema() prints by itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.printSchema()

    catalog_table = df.select(["title", "author", "language", "id", "_url"]) \
                      .filter("language = 'en'")

    catalog_table = catalog_table.withColumnRenamed("title", "book_title") \
                                 .withColumnRenamed("_url", "location") \
                                 .withColumnRenamed("id", "book_id") \
                                 .distinct()

    catalog_table.write.parquet(
        output_data + "catalog",
        mode='overwrite',
        # partitionBy=["author"],
    )

    # Collect the list of files. This can be large ~20k
    # NOTE(review): the first 6 characters of each location are dropped --
    # presumably a fixed prefix in the catalog's "_url" column; confirm.
    book_url_list = [
        input_data + str(x.location)[6:]
        for x in catalog_table.select("location").collect()
    ]

    # Each row of files_df holds the full text of one book.
    files_df = spark.read.text(book_url_list, wholetext=True)

    print("partition size", files_df.rdd.getNumPartitions())

    # Create a book id column from the file name: last path component, with
    # everything from ".txt" onwards removed.
    book_id_udf = udf(lambda x: x.split("/")[-1].split(".txt")[0],
                      StringType())
    files_df = files_df.withColumn("id", book_id_udf(input_file_name()))

    # Each row in files_df represents one full book-length text. Below, each
    # row's data is split to find the number of sentences, the number of
    # words and the number of syllables, in order to compute the grade level
    # and reading ease scores.

    # Regex to detect a sentence, from
    # https://stackoverflow.com/questions/25735644/
    #    python-regex-for-splitting-text-into-sentences-sentence-tokenizing
    # Raw string: same pattern text as before, without invalid string escapes.
    SENTENCE_REGEX = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    SYLLABLES_UDF = udf(lambda x: len(hyphenate_word(x)), IntegerType())

    # Generate list of sentences, words and syllables for each book.
    sentences = files_df.select(
        "id",
        split("value", SENTENCE_REGEX).alias("sentences"))
    sentence_count = sentences.select(
        "id",
        size("sentences").alias("sentence_count"))

    # Words are obtained by splitting on a single space only.
    words_df = files_df.select("id", split("value", ' ').alias("words"))
    word_count = words_df.select("id", size("words").alias("word_count"))
    syllables_count = words_df.select("id", explode("words").alias("syllables")) \
                           .select("id", SYLLABLES_UDF("syllables").alias("syllables")) \
                           .groupby("id").agg(f_sum("syllables").alias("syllables_count"))

    # Combine the three per-book counts and compute all scores.
    results = sentence_count.join(word_count, "id",
                                  "inner").join(syllables_count, "id", "inner")

    # Flesch-Kincaid grade level:
    #   0.39 * words/sentences + 11.8 * syllables/words - 15.59
    results = results.withColumn(
        "grade_level",
        0.39 * col("word_count") / col("sentence_count")
        + 11.8 * col("syllables_count") / col("word_count")
        - 15.59)

    # Flesch reading ease:
    #   206.835 - 1.015 * words/sentences - 84.6 * syllables/words
    # The 1.015 coefficient was previously missing from this expression.
    results = results.withColumn(
        "reading_ease",
        206.835
        - 1.015 * col("word_count") / col("sentence_count")
        - 84.6 * col("syllables_count") / col("word_count"))

    results.write.parquet(
        path=output_data + "reading_difficulty",
        mode="overwrite",
    )
        m = sampa_re_oh.search(line)
        if m:
            #print i, title, m.group().replace('&quot;', '"')
            #if type in ('Noun', 'Proper noun'):
            words.add(title)
            #if type == 'Verb':
            #    print i, title
            #type = 'Unknown'
            # uncomment this block when trying new heuristics so that we don't have to scan the whole wiktionary
            # if i > 10000000:
            #     break

print len(words)
output = []
for w in words:
    parts = hyphenate_word(w)
    if len(parts) > 1:
        try:
            # Try a bunch of heuristics to turn the word into something coherent after we've crammed a bro in there
            for i, p in enumerate(parts):
                if i == 0 and p[1] == 'o':
                    parts[i] = 'BRO' + p[2:]
                    raise WordUsed()
                if len(p) == 2 and p[1] == 'o':
                    if i > 0 and p[0] in 'tgvdnl':
                        parts[i] = p[0] + 'BRO'
                    else:
                        parts[i] = 'BRO'
                    raise WordUsed()
                elif p.startswith('o'):
                    parts[i] = 'BR' + p
Example #34
0
def syllabify_orth_with_hyphenate(token, num_sylls=None):
    """Split *token* into pieces using ``hyphenate.hyphenate_word``.

    Args:
        token: The word to split.
        num_sylls: Accepted but not used by this implementation.

    Returns:
        The value returned by ``hyphenate_word(token)``.
    """
    # Imported inside the function, so the ``hyphenate`` package is only
    # needed when this function is actually called.
    from hyphenate import hyphenate_word
    return hyphenate_word(token)
Example #35
0
# Split the raw input on the " \n  \n " marker (presumably the separator
# between individual poems -- confirm against the corpus format).
texts = text_input.split(" \n  \n ")

splitted_texts = []
progressbar = ProgressBar(len(texts))
for text in texts:
    text = text_to_word_sequence(
        text, filters=text_filter, lower=True, split=" ")

    # Gather this text's tokens: newline words become the LINEEND marker,
    # every other word is replaced by the pieces hyphenate_word yields.
    tokens = []
    for word in text:
        if word == "\n":
            word = "LINEEND"
            tokens.append(word)
        else:
            tokens.extend(hyphenate_word(word))

    # Each token is preceded by a single space; the start marker itself
    # already ends with a space, exactly as in the incremental build-up.
    splitted_text = ("POEMSTART "
                     + "".join(" " + token for token in tokens)
                     + " POEMEND")
    splitted_texts.append(splitted_text)
    progressbar.count()
print("")

# Create an initial tokenizer and fit it on the syllable-split texts
tokenizer_options = dict(filters=text_filter,
                         lower=True,
                         split=" ",
                         char_level=False)
text_tokenizer = Tokenizer(**tokenizer_options)
text_tokenizer.fit_on_texts(splitted_texts)

# Generate a list of words that occur more than n times
# Generate a list of words that occur less than n times
Example #36
0
def syll_count(w):
    """Return how many pieces ``hyphenate_word`` splits *w* into."""
    pieces = hyphenate_word(w)
    return len(pieces)
Example #37
0
 def hyphenate(self):
     """Store the split form of ``self.full_tok`` on ``self.hyphenated``.

     The split is produced by ``hyphenate.hyphenate_word``.
     """
     splitter = hyphenate.hyphenate_word
     self.hyphenated = splitter(self.full_tok)
Example #38
0
async def on_message(message):
    if client.user.id != message.author.id and not message.author.bot:
        with open('users.json', 'r+') as userfile:
            users = json.loads(userfile.read())
            if not message.author.id in users.keys():
                users[str(message.author.id)] = {"xp": 0, "level": 1}

            #Add xp to a user's file based off of message length and a modifier
            users[str(message.author.id)]["xp"] = int(users[str(
                message.author.id)]["xp"]) + math.floor(
                    len(message.content) / 8) + 10
            if int(users[str(message.author.id)]["xp"]) > int(users[str(
                    message.author.id)]["level"]) * 1000:
                print(message.author.name + " just leveled up!")
                users[str(message.author.id)]["level"] = int(users[str(
                    message.author.id)]["level"]) + 1
                users[str(message.author.id)]["xp"] = 0

                embed = discord.Embed(
                    title="Level Up!",
                    description=str(message.author.display_name) +
                    " is now level " +
                    str(users[str(message.author.id)]["level"]) + "!",
                    color=0xbc42f4)
                if message.author.avatar_url:
                    embed.set_thumbnail(url=message.author.avatar_url)
                await client.send_message(message.channel, embed=embed)
            userfile.seek(0)
            userfile.truncate(
                0
            )  #erases file before dumping the new json. Shouldn't have to do this but we're here arn't we
            json.dump(users, userfile, indent=4)

        if not os.path.isfile(
                os.path.join('settings', str(message.server.id + '.json'))):
            shutil.copy2(
                'settings.json',
                os.path.join('settings', str(message.server.id + '.json')))
            print("created " + str(
                os.path.join('settings', str(message.server.id + '.json'))))

        with open(os.path.join('settings', str(message.server.id + '.json')),
                  'r') as serversettings:
            settings = json.loads(serversettings.read())
            prefix = str(settings["bot"]["prefix"])
            global txtout

            #.test#
            if await checkCommand(settings, "test", message):
                print(':robot:')
                await client.send_message(message.channel, ':robot:')

            #.info#
            elif await checkCommand(settings, "info", message):
                print('BOT INFO')
                embed = discord.Embed(
                    title="BOT INFO",
                    description=
                    "Made by @ShiftyWizard#4823 & @Arboreal#4200 for fun.",
                    url="https://github.com/leaharboreal/bot",
                    color=0x1abc9c)
                embed.set_thumbnail(
                    url=
                    "https://raw.githubusercontent.com/leaharboreal/bot/master/profilepic.png"
                )
                embed.set_footer(
                    text="© bot | 2018",
                    icon_url=
                    "https://raw.githubusercontent.com/leaharboreal/bot/master/profilepic.png"
                )
                await client.send_message(message.channel, embed=embed)

            elif message.content.lower().startswith(
                    prefix + settings["commands"]["settings"]["command"]
            ) and message.author.server_permissions.manage_server:
                #upgrade the file's read permissions to rw#
                serversettings.close()
                with open(
                        os.path.join('settings',
                                     str(message.server.id + '.json')),
                        'r+') as serversettings:
                    #convert the current server's settings.json file into a python object#
                    settings = json.loads(serversettings.read())

                    #set changed to false so if no setting is modified the file will be unchanged#
                    changed = False

                    #check if number of args in message is high enough else reply with syntax error#
                    if len(message.content.lower().split(" ")) > 1:

                        #check if the first argument is commands#
                        if message.content.lower().split(" ")[1] == "commands":

                            #check if number of args in message is high enough for this branch#
                            if len(message.content.lower().split(" ")) > 2:

                                #ensure the command exists and is not the info or settings command#
                                if (message.content.lower().split(" ")[2]
                                        in settings["commands"]
                                    ) and not (message.content.lower().split(
                                        " ")[2] in ["info", "settings"]):

                                    #check if number of args in message is high enough for this branch#
                                    if len(message.content.lower().split(" ")
                                           ) > 3 and len(message.content.lower(
                                           ).split(" ")) < 6:

                                        #check if the user wants to change the command alias#
                                        if message.content.lower().split(
                                                " ")[3] == "command":

                                            #ensure the command contains only safe characters and is smaller or equal to 16 characters#
                                            if re.match(
                                                    r"^[\w\d~!@#$%^&+=;:,./?\*\-]{1,16}$",
                                                    message.content.lower().
                                                    split(" ")[4]):

                                                #set the selected command to the alias#
                                                settings["commands"][
                                                    message.content.lower(
                                                    ).split(" ")[2]][
                                                        "command"] = message.content.lower(
                                                        ).split(" ")[4]

                                                #set changed to true, ensuring the file is saved#
                                                changed = True

                                                #set output message to confirmation#
                                                txtout = message.content.lower(
                                                ).split(
                                                    " "
                                                )[2] + " has been set to " + message.content.lower(
                                                ).split(" ")[4]

                                            #reject command alias#
                                            else:
                                                txtout = "Could not set command. Commands can only 1-16 characters long and contain letters, numbers and these symbols: `~!@#$%^&+=;:,./?*-`"

                                        #check if the user wants to change if command is enabled#
                                        elif message.content.lower().split(
                                                " ")[3] == "enabled":

                                            #check if argument is true#
                                            if message.content.lower().split(
                                                    " ")[4] == "true":

                                                #set the commands enabled value to true#
                                                settings["commands"][
                                                    message.content.lower(
                                                    ).split(" ")
                                                    [2]]["enabled"] = True

                                                #set changed to true, ensuring the file is saved#
                                                changed = True

                                                #set output message to confirmation#
                                                txtout = message.content.lower(
                                                ).split(" ")[
                                                    2] + " is now `enabled`."

                                            #check if answer is false if not true#
                                            elif message.content.lower().split(
                                                    " ")[4] == "false":

                                                #set the commands enabled value to false#
                                                settings["commands"][str(
                                                    message.content.lower(
                                                    ).split(" ")
                                                    [2])]["enabled"] = False

                                                #set changed to true, ensuring the file is saved#
                                                changed = True

                                                #set output message to confirmation#
                                                txtout = message.content.lower(
                                                ).split(" ")[
                                                    2] + " is now `disabled`."

                                            #reject value as it is not true or false#
                                            else:
                                                txtout = "This value can only be set to `true` or `false`."
                                        #unrecognised argument#
                                        else:
                                            txtout = "Incorrect syntax(E:2). `" + prefix + "settings commands " + message.content.split(
                                                " "
                                            )[2] + " <command|enabled> <value>`"
                                    #not enough args#
                                    else:
                                        txtout = "Incorrect syntax (E:1). `" + prefix + "settings commands " + message.content.split(
                                            " "
                                        )[2] + " <command|enabled> <value>`"
                                #unrecognised/locked argument#
                                else:
                                    txtout = "Command `" + message.content.split(
                                        " "
                                    )[2] + "` not found or cannot be modified. Check the github page command list which can be accessed with `" + prefix + "info`"
                            #not enough args#
                            else:
                                txtout = "Incorrect syntax. `" + prefix + "settings commands <commandname> <command|enabled> <value>`"

                        #check if the first argument is bot, if it isn't commands#
                        elif message.content.lower().split(" ")[1] == "bot":

                            #check if number of args in message is high enough for this branch#
                            if len(message.content.lower().split(" ")) > 2:

                                #check if arg is equal to prefix#
                                if message.content.lower().split(
                                        " ")[2] == "prefix":

                                    #check if number of args in message is high enough for this branch#
                                    if len(message.content.lower().split(
                                            " ")) > 3:

                                        #check if user's value matches rule#
                                        if re.match(
                                                r"^[\w\d~!@#$%^&+=;:,./?\*\-]{1,4}$",
                                                message.content.split(" ")[3]):

                                            #set bot prefix to the user value#
                                            settings["bot"][
                                                "prefix"] = message.content.lower(
                                                ).split(" ")[3]

                                            #set changed to true, ensuring the file is saved#
                                            changed = True

                                            #set output message to confirmation#
                                            txtout = "Prefix set. Bot will respond to commands with the prefix `" + message.content.lower(
                                            ).split(
                                                " "
                                            )[3] + "`. To access settings, use the new prefix."

                                        #reject prefix as it does not conform to the character requirements#
                                        else:
                                            txtout = "Could not set prefix. Prefixes can only 1-4 characters long and contain letters, numbers and these symbols: `~!@#$%^&+=;:,./?*-`"

                                    #not enough args#
                                    else:
                                        txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`"

                                #incorrect arg#
                                else:
                                    txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`"

                            #not enough args#
                            else:
                                txtout = "Incorrect syntax. `" + prefix + "settings bot prefix <value>`"

                        #the first argument is incorrect, so respond with a syntax error#
                        else:
                            txtout = "Incorrect syntax. `" + prefix + "settings <commands|bot>`"

                    #not enough args#
                    else:
                        txtout = "Incorrect syntax. `" + prefix + "settings <commands|bot>`"

                    #if the file has been changed at any point, save changes#
                    if changed:
                        serversettings.seek(
                            0)  # reset file position to the beginning.
                        json.dump(settings, serversettings, indent=4)
                        serversettings.truncate()

                    #log output in console then send as discord message#
                    print(txtout)
                    await client.send_message(message.channel, txtout)

            #.purge
            elif await checkCommand(
                    settings, "purge",
                    message) and message.server.me.permissions_in(
                        message.channel).manage_messages:
                if message.author.server_permissions.manage_messages and message.author.server_permissions.manage_server:
                    if len(message.content.lower().split(" ")) == 2:
                        limit = int(message.content.lower().split(" ")[1])
                        if limit <= 100:
                            if await client.purge_from(message.channel,
                                                       limit=limit):
                                await client.send_message(
                                    message.channel, ":warning: Deleted " +
                                    str(limit) + " messages.")
                        else:
                            await client.send_message(message.channel,
                                                      "100 messages max!")
                    else:
                        await client.send_message(
                            message.channel,
                            "Please specify the number of messages!")
                else:
                    await client.send_message(
                        message.channel,
                        "You need at least manage messages and manage server permissions to do that!"
                    )

            #.level#
            elif await checkCommand(settings, "level", message):
                txtout = "Level " + str(users[str(
                    message.author.id)]["level"]) + "\n" + str(users[str(
                        message.author.id)]["xp"]) + " xp"
                embed = discord.Embed(title=str(message.author.display_name),
                                      description=txtout,
                                      color=0x42b3f4)
                if message.author.avatar_url:
                    embed.set_thumbnail(url=message.author.avatar_url)
                await client.send_message(message.channel, embed=embed)

            #.addquote#
            elif await checkCommand(settings, "addquote", message):
                #check if the quotes file exists for the server, if not, create a file with an empty json object#
                if not os.path.isfile(
                        os.path.join('quotes',
                                     str(message.server.id + '.json'))):
                    with open(
                            os.path.join('quotes',
                                         str(message.server.id + '.json')),
                            'a') as f:
                        f.write("{\n}")
                #open the server's quote file#
                with open(
                        os.path.join('quotes',
                                     str(message.server.id + '.json')),
                        'r+') as f:
                    quotes = json.loads(
                        f.read())  #initialize json file as python object#

                    #set quotemessage to the message object before the user's command#
                    quotemessage = await getQuote(message)
                    quote = base64.b64encode(
                        str(quotemessage.content).encode('utf-8')).decode(
                            'utf-8')

                    #ensure quote does not contain any illegal symbols#

                    if quotemessage.author.id in quotes:  #if the user already has a quote object then append quote#
                        quoteid = int(
                            max(quotes[quotemessage.author.id].keys())) + 1
                        quotes[str(quotemessage.author.id)][quoteid] = quote
                    else:  #if they don't have a quote object, create one with their 1st quote#
                        quoteid = 1
                        quotes[quotemessage.author.id] = {}
                        quotes[quotemessage.author.id][quoteid] = quote

                    print("Added Quote to file " + message.server.id +
                          ".json: " +
                          str(quotemessage.content))  #add log of changes#
                    await client.send_message(
                        message.channel, ":white_check_mark: Added quote: `" +
                        str(quotemessage.content) + "`"
                    )  #confirm addition of quote#

                    #seek to start of file before dumping the new json object in the file#
                    f.seek(0)
                    json.dump(quotes, f, indent=4)
                    f.close()  #close the file since we are done with it#

            #.quote#
            elif await checkCommand(settings, "quote", message):
                if os.path.isfile(
                        os.path.join('quotes',
                                     str(message.server.id + '.json'))):
                    with open(
                            os.path.join('quotes',
                                         str(message.server.id + '.json')),
                            'r') as f:
                        quotes = json.loads(f.read())
                        if message.mentions:
                            if message.mentions[0].id in quotes.keys():
                                quoteauthor = message.mentions[0].id
                                txtout = "```" + base64.b64decode(
                                    str(quotes[quoteauthor][random.choice(
                                        list(quotes[quoteauthor].keys()))])
                                ).decode('utf-8') + "```" + message.mentions[
                                    0].mention
                            else:
                                txtout = "Oops! " + message.mentions[
                                    0].mention + " hasn't been quoted on this server yet.\nUse `" + prefix + "addquote` when they say something great."
                        else:
                            quoteauthor = await client.get_user_info(
                                random.choice(list(quotes.keys())))
                            txtout = "```" + base64.b64decode(
                                str(quotes[quoteauthor.id][random.choice(
                                    list(quotes[quoteauthor.id].keys()))])
                            ).decode('utf-8') + "```" + quoteauthor.mention
                else:
                    txtout = "Oops! No quotes available for this server!\nUse `" + prefix + "addquote` to add quotes."
                print(txtout)
                await client.send_message(message.channel, txtout)

            #@SOMEONE#
            elif await checkCommand(settings,
                                    "@someone",
                                    message,
                                    atStart=False):
                x = message.server.members
                members = []
                for member in x:
                    if member.permissions_in(
                            message.channel
                    ).read_messages and member.permissions_in(
                            message.channel).send_messages:
                        members.append(str(member.id))
                someone = random.choice(members)
                print(
                    str(someone) + " was mentioned with @someone by " +
                    str(message.author.id))
                txtout = "<@" + someone + ">" + " was randomly mentioned with @someone!"
                await client.send_message(message.channel, txtout)

            #Colour
            elif await checkCommand(settings, "colour", message,
                                    atStart=False):
                colour = message.content.lower().split(" ")[1]
                if re.match(r"(^#[\d,a-f]{6}$|^#[\d,a-f]{3}$)", colour):
                    colourType = "hex"
                    colour = colour[1:]
                elif re.match(
                        r"rgb\((25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2})\)",
                        colour):
                    colourType = "rgb"
                elif re.match(
                        r"hsl\((25[0-5]|2[0-4]\d|[0,1]\d\d|\d{1,2}),(100|\d{1,2})%,(100|\d{1,2})%\)",
                        colour):
                    colourType = "hsl"
                elif re.match(
                        r"cmyk\((100|\d{1,2}),(100|\d{1,2}),(100|\d{1,2}),(100|\d{1,2})\)",
                        colour):
                    colourType = "cmyk"
                else:
                    colourType = None
                if colourType:
                    print("http://thecolorapi.com/id?" + colourType + "=" +
                          colour)
                    with urllib.request.urlopen("http://thecolorapi.com/id?" +
                                                colourType + "=" +
                                                colour) as url:
                        data = json.loads(url.read().decode())
                        colourHEX = data["hex"]["value"]
                        colourRGB = data["rgb"]["value"]
                        colourHSL = data["hsl"]["value"]
                        colourCMYK = data["cmyk"]["value"]
                        embed = discord.Embed(title=data["name"]["value"],
                                              colour=discord.Colour(
                                                  int(data["hex"]["value"][1:],
                                                      16)))
                        embed.add_field(name="Hex",
                                        value=colourHEX,
                                        inline=True)
                        embed.add_field(name="RGB",
                                        value=colourRGB,
                                        inline=True)
                        embed.add_field(name="HSL",
                                        value=colourHSL,
                                        inline=True)
                        embed.add_field(name="CMYK",
                                        value=colourCMYK,
                                        inline=True)
                        #embed.set_thumbnail(url=data["image"]["bare"])
                        embed.set_footer(text="Sourced using thecolorapi")
                        await client.send_message(message.channel, embed=embed)
                else:
                    await client.send_message(message.channel,
                                              "Format not recognised :(")

            #RETURN STRING WITH SPACES EVERY CHARACTER#
            elif await checkCommand(settings, "widespace", message):
                txtout = ""
                x = ""
                for char in message.content[
                        len(prefix +
                            settings["commands"]["widespace"]["command"]):]:
                    if ord(char) in range(33, 127):
                        x = chr(ord(char) + 0xFEE0)
                    else:
                        x = char
                    txtout = txtout + str(x) + " "
                print(txtout)
                await client.send_message(message.channel, txtout)

            #VERBOSE MESSAGE GENERATOR#
            elif await checkCommand(settings, "verbose", message):
                txtin = message.content[
                    len(prefix + settings["commands"]["verbose"]["command"]):]
                txtout = ""
                synonyms = []
                for word in txtin.split():
                    for syn in wordnet.synsets(word):
                        for l in syn.lemmas():
                            synonyms.append(l.name())
                    if not synonyms:
                        txtout += (" " + word)
                    else:
                        txtout += (" " + max(set(synonyms), key=len))
                    synonyms = []
                txtout = txtout.replace("-", " ")
                txtout = txtout.replace("_", " ")
                print(txtout)
                await client.send_message(message.channel, txtout)

            #SUCCINCT MESSAGE GENERATOR#
            elif await checkCommand(settings, "succinct", message):
                txtin = message.content[
                    len(prefix + settings["commands"]["succinct"]["command"]):]
                txtout = ""
                synonyms = []
                for word in txtin.split():
                    for syn in wordnet.synsets(word):
                        for l in syn.lemmas():
                            synonyms.append(l.name())
                    if not synonyms:
                        txtout += (" " + word)
                    else:
                        txtout += (" " + min(set(synonyms), key=len))
                    synonyms = []
                txtout = txtout.replace("-", " ")
                txtout = txtout.replace("_", " ")
                print(txtout)
                await client.send_message(message.channel, txtout)

            #SMUSH TWO WORDS TOGETHER#
            elif await checkCommand(settings, "smush", message):
                worda = message.content.split(" ")[1]
                wordb = message.content.split(" ")[2]
                asyl = hyphenate_word(worda)
                bsyl = hyphenate_word(wordb)
                if len(bsyl) > 1:
                    txtout = str(asyl[0]) + "".join(bsyl[1:])
                else:  #in case word 2 is only 1 syllable, just join the words
                    txtout = str(asyl[0]) + "".join(bsyl[0])
                txtout = worda + " " + wordb + " (" + txtout + ")"
                print(txtout)
                await client.send_message(message.channel, txtout)

            elif await checkCommand(settings, "dog", message):
                with urllib.request.urlopen(
                        "https://dog.ceo/api/breeds/image/random") as url:
                    data = json.loads(url.read().decode())
                    embed = discord.Embed(color=0xeee657)
                    embed.set_image(url=data["message"])
                    print(data["message"])
                    await client.send_message(message.channel, embed=embed)

            elif await checkCommand(settings, "catfact", message):
                with urllib.request.urlopen(
                        "https://cat-fact.herokuapp.com/facts/random") as url:
                    data = json.loads(url.read().decode())
                    txtout = data["text"]
                    print(txtout)
                    await client.send_message(message.channel, txtout)

            #CHOOSE FROM USER SPECIFIED LIST#
            elif await checkCommand(settings, "choose", message):
                items = message.content[
                    len(prefix + settings["commands"]["choose"]["command"]):]
                txtout = random.choice(items.split("|"))
                print(txtout)
                await client.send_message(message.channel, txtout)

            #"RATE" SOMETHING BY PICKING A NUMBER FROM 1 TO 10#
            elif await checkCommand(settings, "rate", message):
                txtout = "I\'d rate " + str(message.content[len(
                    prefix +
                    settings["commands"]["rate"]["command"]):]) + " **" + str(
                        random.randrange(10)) + " out of 10!**"
                print(txtout)
                await client.send_message(message.channel, txtout)

            #FLIP A COIN#
            elif await checkCommand(settings, "flip", message):
                embed = discord.Embed(title="Flip",
                                      description=random.choice(
                                          ['Heads', 'Tails']),
                                      color=0xeee657)
                await client.send_message(message.channel, embed=embed)

            #PRINT EMOTES#
            elif await checkCommand(settings, "emotes", message):
                txtout = ""
                with open('emotes.txt', 'r') as file:
                    lines = list(file)
                for x in range(int(message.content.lower().split(" ")[1])):
                    txtout += str(random.choice(lines)).rstrip()
                print(message.content.lower().split(" ")[1] + " emojis")

                await client.send_message(message.channel, txtout)

            #PERFORM XKCD37#
            elif settings["commands"]["xkcd37"][
                    "enabled"] == True and re.search(xkcd37, message.content):
                txtout = re.sub(xkcd37, xkcd37sub, message.content, 0)
                txtout = "```> " + txtout + "``` xkcd 37"
                print(txtout)
                await client.send_message(message.channel, txtout)

            #DADBOT#
            elif settings["commands"]["dadbot"]["enabled"] == True and (
                    message.content.lower().split(" ")[0] == 'i\'m'
                    or message.content.lower().split(" ")[0] == 'im'):
                txtout = "Hi " + " ".join(
                    message.content.split(" ")[1:]) + ", I'm Dad"
                print(txtout)
                await client.send_message(message.channel, txtout)

            #REPLY TO COMMENTS ABOUT BOT#
            elif 'bot' in message.content.lower().split(
                    " ") and settings["commands"]["bot"]["enabled"] == True:
                txtin = sid.polarity_scores(message.content)
                if float(txtin['compound']) >= 0.2:
                    txtout = ':heart:'
                elif float(txtin['compound']) <= 0.15 and float(
                        txtin['compound']) >= -0.15:
                    txtout = 'I am unfeeling'
                else:
                    txtout = 'no u'
                print(txtout + ' ' + str(txtin['compound']))
                await client.send_message(message.channel, txtout)

            elif 'oopsie' in message.content.lower().split(
                    " ") and settings["commands"]["oopsie"]["enabled"] == True:
                txtout = "OOPSIE WOOPSIE!! Uwu We make a fucky wucky!! A wittle fucko boingo! The code monkeys at our headquarters are working VEWY HAWD to fix this!"
                print(txtout)
                await client.send_message(message.channel, txtout)

            #GARBAGE MEME#
            elif settings["commands"]["peestream"][
                    "command"] in message.content.lower(
                    ) and settings["commands"]["peestream"]["enabled"] == True:
                embed = discord.Embed(color=0xeee657)
                embed.set_image(
                    url=
                    "https://cdn.discordapp.com/attachments/260061122193784833/404628539728723969/chrome_2018-01-07_20-25-17.jpg"
                )
                print("Pee Stream")
                await client.send_message(message.channel, embed=embed)
            txtout = ""
Example #39
0
def syllabify_orth_with_hyphenate(token, num_sylls=None):
    """Split ``token`` with ``hyphenate.hyphenate_word``, optionally checking the count.

    Args:
        token: The word handed to ``hyphenate_word``.
        num_sylls: Expected number of pieces. Any falsy value (``None``, ``0``)
            disables the check.

    Returns:
        The result of ``hyphenate_word(token)`` when no count is requested or
        when its length equals ``num_sylls``; otherwise an empty list.
    """
    # Imported lazily so the ``hyphenate`` package is only needed when this
    # function is actually called.
    from hyphenate import hyphenate_word

    pieces = hyphenate_word(token)
    # Reject the split only when a count was requested and it does not match.
    if num_sylls and len(pieces) != num_sylls:
        return []
    return pieces
Example #40
0
def hyphenate_word(word):
    """Return the length of ``hyphenate.hyphenate_word(word)``.

    Thin adapter around the ``hyphenate`` module: the value returned is the
    number of items in its result (presumably the syllable pieces of ``word``
    -- confirm against the ``hyphenate`` package), not the pieces themselves.
    """
    pieces = hyphenate.hyphenate_word(word)
    return len(pieces)
Example #41
0
                    audioisvalid = False
                    print("Sample", audio_name, "not found in recombine loop",
                          audio_pathlist, "was pathlist", en_word, "is",
                          oz_word)
            export_word()
            # make sure audio segment is empty for next word
            combined_audio = AudioSegment.empty()
            phonemes = str(phonemes).replace('[', '').replace(']', '').replace(
                "'", '').replace('"', '').replace(',', '')
            # update the lexicon dataframe if the word is not already known
            if not df4['oz_word'].eq(oz_word).any():
                df4.loc[len(df4.index)] = [oz_word, phonemes]

        else:
            # print(en_word, " - no translation found in dataframe")
            en_hyphens = hyphenate_word(en_word)
            trl_word = en_word
            trl_hyphens = en_hyphens
            temp_hyphens = []
            for hyphen in en_hyphens:
                for key in forbidden_letters.keys():
                    hyphen = hyphen.replace(key, forbidden_letters[key])
                temp_hyphens.append(hyphen)
            for key in forbidden_letters.keys():
                trl_word = trl_word.replace(key, forbidden_letters[key])
            # print(trl_word)
            hyphens = temp_hyphens
            if len(en_hyphens) == 1:
                word_len = len(str(en_word))
                hyphens_len = math.sqrt(len(trl_word))
                hyphens_rounded = int(hyphens_len)