def syllablize(poem):
    # syllablizer setup
    if not is_installed(language):
        install(language)
    hyph = Hyphenator(language)

    # output list to send back through API
    output = []
    for line in poem:
        # list of words in line
        words = line.split()
        syllablized_line = []
        for word in words:
            syls = hyph.syllables(word)
            new_word = ""
            if len(syls) == 0:
                new_word = word
            else:
                for syl in syls:
                    new_word += syl
                    new_word += " "
            syllablized_line.append(new_word.strip())
        if len(syllablized_line) > 0:
            output.append(syllablized_line)
    return output
def hyphenate(value, arg=None, autoescape=None):
    # Default minimal length
    minlen = 6

    if arg:
        args = arg.split(u',')
        code = args[0]
        # Override minimal length, if specified
        if len(args) > 1:
            minlen = int(args[1])
    else:
        # No language specified, use Django's current
        code = get_language()

    # Normalize the locale code, ignoring a potential encoding suffix
    lang = locale.normalize(code).split('.')[0]

    # Make sure the proper language is installed
    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # join syllables with a soft hyphen (U+00AD)
            new.append(u'\u00ad'.join(h.syllables(word)))
        else:
            new.append(word)

    result = u' '.join(new)
    return mark_safe(result)
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps:
    :param sentence:
    :param sent_dict:
    :return:
    '''
    h_en = Hyphenator('en_US')
    info_list = []

    # words = re.split('\W+', sentence)
    words = re.split('[,.!?\r\n ]+', sentence)
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0

        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        num = t2n.text2num(word)

        info_list.append((word, timestamps[ind * 2], timestamps[ind * 2 + 1],
                          len(h_en.syllables(unicode(word))), c_sentiment, punct, num))

    return info_list
def syllabizeNames(nameList):
    tempList = []
    for lang in ['en_US']:
        if not is_installed(lang):
            install(lang)
    en_US = Hyphenator('en_US')
    for item in nameList:
        tempList.append(en_US.syllables(item))
    return tempList
def main(arguments: List[str] = None):
    namespace = parser.parse_args(arguments)
    command = namespace.command

    if command == 'export_font':
        from .pdf import PDF

        glyphs = set(GLYPHS)
        cwd = pathlib.Path('.')
        if namespace.text is not None:
            for text_glob in namespace.text:
                for text_file in cwd.glob(text_glob):
                    print(f'Taking glyphs from:\n {text_file}')
                    glyphs.update(set(text_file.read_text('utf-8')))
        font = PDF.font(namespace.font_name, namespace.font_size, glyphs=glyphs)
        font.export(namespace.output)

    elif command == 'tester':
        from .tester import main
        main(namespace)

    elif command == 'hyphenate':
        text = namespace.input.read().decode()
        hyphenator = Hyphenator(language=namespace.language)
        for token_type, text in tokenize(text):
            if token_type is TokenType.WORD:
                syllables = hyphenator.syllables(text) or [text]
                namespace.output.write_chunk('-'.join(syllables).encode())
            else:
                namespace.output.write_chunk(text.encode())

    elif command == 'render':
        import json

        from .printer import Page, FontSpec, Fragment
        from .pdf import PDF

        text = namespace.input.read()
        raw_pages = text.split('\0\n')

        pages = []
        for raw_page in raw_pages:
            if not raw_page:
                continue
            page_data = json.loads(raw_page)
            font_spec = FontSpec(page_data['font_spec']['name'],
                                 page_data['font_spec']['size'])
            paper_width = page_data['paper_width']
            paper_height = page_data['paper_height']
            fragments = [
                Fragment(**fragment) for fragment in page_data['fragments']
            ]
            page = Page(font_spec, paper_width, paper_height, fragments)
            pages.append(page)

        pdf = PDF(namespace.output)
        pdf.render(pages)
        pdf.finish()
def encode(self, word):
    num_string = ""
    h_mx = Hyphenator('es_MX')
    for syllable in h_mx.syllables(unicode(word)):
        for idx, pattern in enumerate(self.patterns):
            for regex in pattern:
                if re.match(regex, syllable):
                    num_string += str(idx)
    return num_string
def get_syllables(lyrics):
    h = Hyphenator()
    syllables = []
    for word in lyrics.split(" "):
        syl = h.syllables(word)
        if syl:
            syllables.append(syl)
        else:
            syllables.append([word])
    return syllables
def by_syllable(input_gen, lang, install_lang_p):
    if install_lang_p and not dictools.is_installed(lang):
        dictools.install(lang)

    hyphenator = Hyphenator(lang)

    for word in input_gen:
        syllables = hyphenator.syllables(word)
        logging.debug("syllables: {}".format(syllables))
        for syllable in syllables:
            yield syllable
def getUniqueSyllables(sonnets):
    h = Hyphenator('en_GB')
    s = set()
    for sonnet in sonnets:
        for sentence in sonnet:
            for word in sentence:
                syllables = h.syllables(unicode(word.lower()))
                if len(syllables) < 2:
                    s.add(unicode(word.lower()))
                else:
                    s |= set(syllables)
    return list(s)
def tokenize_word_to_syllables(word, lang):
    global hyphenator
    if hyphenator is None:
        print('Initializing Hyphenator (' + lang + ')...')
        hyphenator = Hyphenator(lang)

    syllables = hyphenator.syllables(word)

    # Words with only one syllable need special treatment,
    # because the hyphenator does not recognize them
    if len(syllables) == 0:
        syllables = [word]

    return syllables
def hyphenate(value, arg=None, autoescape=None):
    if autoescape:
        esc = conditional_escape
    else:
        esc = lambda x: x

    minlen = 7

    if arg:
        args = arg.split(u',')
        code = args[0]
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    # Looks like this is assuming that the language code will arrive as
    # 'xx-YY'. In our case, it will arrive as simply 'en', so we MUST expand
    # this into a locale in order to work with PyHyphen.
    #
    # TODO: This should probably be a lookup against a dict in settings?
    s = code.split(u'-')
    if len(s) == 1:
        if s[0] == 'en':
            s.append(u'US')
        elif s[0] == 'bg':
            s.append(u'BG')
    lang = s[0].lower() + u'_' + s[1].upper()

    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)
    new = []
    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # join syllables with a soft hyphen (U+00AD)
            new.append(u'\u00ad'.join(h.syllables(word)))
        else:
            new.append(word)

    result = u' '.join(new)
    return mark_safe(result)
class hyphenator:
    def __init__(self, language='it_IT'):
        self.h = Hyphenator(language)

    def split_syllables(self, word):
        syllables = self.h.syllables(utils.check_unicode(word))
        return syllables

    def split_word(self, word):
        pairs = self.h.pairs(utils.check_unicode(word))
        return pairs
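A minimal usage sketch for the wrapper class above, assuming utils.check_unicode simply returns the word as a unicode string and that the it_IT dictionary is installed; the exact splits depend on the dictionary in use.

# Hypothetical usage of the wrapper class above (Italian dictionary by default).
splitter = hyphenator('it_IT')
print(splitter.split_syllables('ragazzo'))  # e.g. [u'ra', u'gaz', u'zo']
print(splitter.split_word('ragazzo'))       # pairs of possible two-part splits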
def parse_word(self, word):
    """Returns syllables and the stress of each syllable if it exists, else None.

    First tries the NLTK cmudict; if that fails, falls back to PyHyphen."""
    syl_stress = None
    try:
        word_info = self.cmu_dict[word.lower()][0]  # no way to differentiate between different pronunciations
        syl_num = len(list(y for y in word_info if y[-1].isdigit()))
        syl_stress = list(int(y[-1]) for y in word_info if y[-1].isdigit())
    except KeyError:
        h_en = Hyphenator('en_GB')
        syl_num = len(h_en.syllables(unicode(word)))
        if syl_num == 0:
            syl_num = 1
    return syl_num, syl_stress
class EnSyllabSortedTokenizer():
    def __init__(self, stopwords):
        self.preprocessor = preprocessor.Preprocessing(stopwords)
        self.syllbler = Hyphenator('en_US')

    def tokenize(self, text, use_preproc=False, use_stem=False, use_lemm=False,
                 check_length=True, check_stopwords=True):
        preprocessed_text = text
        if use_preproc:
            preprocessed_text, _ = self.preprocessor.preproc(
                text, use_lemm=use_lemm, use_stem=use_stem,
                check_stopwords=check_stopwords, check_length=check_length)

        syllables = []
        for word in preprocessed_text.split():
            tokens = self.syllbler.syllables(word)
            syllables += [''.join(sorted(token)) for token in tokens]
        return syllables
def main():
    hyphenator = Hyphenator('en_GB')

    with open(filename, 'r', encoding='utf-8') as f:
        chapters = parse_chapters(f)

    with open(filename, 'w', encoding='utf-8', newline='\n') as f:
        for chapter_idx, (chapter_name, entries, head_eager_code, tail_eager_code) in enumerate(chapters):
            print(chapter_name)
            f.write(f'@<|\n{head_eager_code}\n|>\n')
            for entry_idx, (code, chara_name, dialogue) in enumerate(entries):
                if code:
                    f.write(f'<|\n{code}\n|>\n')
                if dialogue:
                    dialogue = normalize(dialogue)
                    dialogue = add_soft_hyphens(hyphenator, dialogue)
                    dialogue = add_nbsp(dialogue)
                    if chara_name:
                        f.write(f'{chara_name}::{dialogue}\n')
                    else:
                        f.write(dialogue + '\n')
                if entry_idx < len(entries) - 1:
                    f.write('\n')
            if tail_eager_code:
                f.write(f'@<|\n{tail_eager_code}\n|>\n')
            else:
                f.write('@<||>\n')
            if chapter_idx < len(chapters) - 1:
                f.write('\n')
def make_dicts(filename):
    syllables = {}
    with open(filename) as theFile:
        f = theFile.read()
    f = f.lower()
    f = f.replace('\r', '\n')
    lines = f.split("\n")
    for line in lines:
        if line != "":
            line = line.split('\\')
            if line[1] not in syllables and len(line[1].split()) == 1:
                syllables[line[1]] = line[-1].split("-")

    for lang in ['en_US']:
        if not is_installed(lang):
            install(lang)

    # other dict
    h_en = Hyphenator('en_US')
    return (syllables, h_en)
class TextGenerator:
    def __init__(self, generatorName, trainString, prefixLength):
        self.generatorName = generatorName
        self.chain = MarkovChain()
        self.chain.generateDatabase(trainString, n=prefixLength)
        self.currState = []
        self.hyphenator = Hyphenator('en_US')
        self.syllableQ = Queue()
        self.stripPattern = re.compile('[\W_]+')
        while (len(self.currState) < prefixLength):
            self.currState = self.chain.generateString().split()[-(prefixLength + 1):-1]

    def load_next_word(self):
        nextword = ""
        try:
            while nextword == "":
                nextword = self.stripPattern.sub('', self.chain._nextWord(self.currState))
            self.currState = self.currState[1:]
            self.currState.append(nextword)
            if len(nextword) < 4:  # because the hyphenator doesn't work for words with fewer than 4 letters
                self.syllableQ.put(nextword)
            else:
                for syllable in self.hyphenator.syllables(nextword):
                    self.syllableQ.put(syllable)
        except UnicodeEncodeError:
            print("unicode error")

    def get_next_syllable(self):
        if (self.syllableQ.empty()):
            self.load_next_word()
        return self.syllableQ.get()
class Word(BasicText):
    """Represents a Word."""

    if "hyphen" in sys.modules:
        h_en = Hyphenator('en_US')

    def __init__(self, text):
        """Initializes a Word."""
        self.text = text

    @BasicText.text.setter
    def text(self, new_text):
        self._text = new_text.strip(""" (),.?!;:\"\'""")

    def count_syllables(self):
        """Counts the number of syllables for an English language Word."""
        try:
            n_syllables = len(Word.h_en.syllables(self.text))
            if n_syllables > 0:
                return n_syllables
        except ValueError:
            # Thrown by the syllables function for words longer than 100 characters.
            return 30
        return 1

    def is_adverb(self):
        """Determines whether the word is an adverb."""
        return re.match(r"\w+ly", self.text)
def syllablize(line):
    """
    take a line and split it into a list of syllables
    """
    hyph_en = Hyphenator('en_US')
    syll_list = []

    # get words separately + count hyphenated words as 2 words
    words = line.replace("-", " ").split()

    for word in words:
        # remove common punctuation
        word = word.replace(",", "").replace(":", "").replace(";", "")
        syllables = hyph_en.syllables(word)
        if not syllables:
            # pyhyphen sometimes returns 1-syllable words back to you,
            # but sometimes returns an empty list... don't know why
            syll_list.append(word)
        for syll in syllables:
            syll_list.append(syll)
    return syll_list
def word_phonic_dict_func(self):
    '''
    Output: Ordered dictionary
    Keys - word
    Value - phonetic representation of the key
    '''
    h_en = Hyphenator('en_US')
    for line in self.lyrics_tokenized:
        for word in line:
            if word not in self.arpabet_dict.keys():
                try:
                    self.arpabet_dict.update({word: pr.phones_for_word(word)[0]})
                    temp = h_en.syllables(unicode(word))
                    if len(temp) > 0:
                        self.word_syl_dict.update({word: temp})
                    else:
                        self.word_syl_dict.update({word: [unicode(word)]})
                except Exception as e:
                    print e
def sylTokenizer(text):
    words = wordTokenizer(text)
    if language == 'en':
        en = Hyphenator('en_US')
        syl_split = map(lambda x: en.syllables(x)
                        if (len(x) > 1 and len(en.syllables(x)) > 0)
                        else [x], words)
        comb_syl_split = map(lambda x: ["".join(x[i:i + ngrams])
                                        for i in range(max(len(x) - ngrams + 1, 1))],
                             syl_split)
        return reduce(lambda x, y: x + y, comb_syl_split)
    elif language == 'te':
        te = Syllabifier()
        syl_split = map(lambda x: te.syllabify_te(x)
                        if (len(x) > 1 and len(te.syllabify_te(x)) > 0)
                        else [x], words)
        comb_syl_split = map(lambda x: ["".join(x[i:i + ngrams])
                                        for i in range(max(len(x) - ngrams + 1, 1))],
                             syl_split)
        return reduce(lambda x, y: x + y, comb_syl_split)
    else:
        hi = Syllabifier()
        syl_split = map(lambda x: hi.syllabify_hi(x)
                        if (len(x) > 1 and len(hi.syllabify_hi(x)) > 0)
                        else [x], words)
        comb_syl_split = map(lambda x: ["".join(x[i:i + ngrams])
                                        for i in range(max(len(x) - ngrams + 1, 1))],
                             syl_split)
        return reduce(lambda x, y: x + y, comb_syl_split)
def main(args):
    if args.quantize and args.device != "cpu":
        raise RuntimeError("Quantization only available on CPU devices")

    port = args.port or os.environ.get("PORT") or 8000

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    lvl = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=lvl)

    word_generator = WordGenerator(
        device=args.device,
        forward_model_path=args.forward_model_path,
        inverse_model_path=args.inverse_model_path,
        blacklist_path=args.blacklist_path,
        quantize=args.quantize,
        is_urban=False,
    )

    urban_generator = None
    if args.forward_urban_model_path:
        logging.info(f"Creating urban model from {args.forward_urban_model_path}")
        urban_generator = WordGenerator(
            device=args.device,
            forward_model_path=args.forward_urban_model_path,
            inverse_model_path=None,
            blacklist_path=args.blacklist_path,
            quantize=args.quantize,
            is_urban=True,
        )

    h_en = Hyphenator('en_US')

    logging.info("Warming up with word generation")
    gen_word = word_generator.generate_word()
    logging.info(f"Generated {gen_word}")

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    wordservice_pb2_grpc.add_WordServiceServicer_to_server(
        WordServiceServicer(word_generator, h_en, urban_generator=urban_generator),
        server
    )
    server.add_insecure_port("[::]:{}".format(port))
    server.start()
    logging.info(f"Listening on port {port}")

    try:
        while True:
            time.sleep(3600 * 24)
    except KeyboardInterrupt:
        server.stop(args.shutdown_grace_duration)
class Nomen:
    def __init__(self):
        self.hyphen = Hyphenator('en_US')

    def load(self, file="./data/data-1.txt"):
        """load training data"""
        self.data = Data()
        if self.data:
            print "Data loaded success"
            print str(self.data)

    def train(self):
        pass

    def rank(self):
        pass

    def get(self, en_name):
        en_name = en_name.lower()

        # lookup = self.data.find(en_name)
        # if lookup:
        #     return lookup

        syll = self.hyphen.syllables(en_name)
        split_onsets(syll)
        split_codas(syll)
        split_glides(syll)
        split_mcs(syll)
        expand_dipththongs(syll)
        print "Syllables:", syll
        return self.backward_max_matching(0, syll)

    def backward_max_matching(self, i, syll):
        if i >= len(syll):
            return ''
        if not syll or len(syll) == 0:
            return ''

        lx = self.data.lexicons
        key = ''.join(syll[i:])
        print "try:", key
        if key in lx:
            candidate = self.rank(lx[key], 0)
            print "find:", key, candidate
            return self.backward_max_matching(0, syll[0:i]) + self.rank(lx[key], 0)
        else:
            return self.backward_max_matching(i + 1, syll)

    def rank(self, l, i):
        rl = sorted(l, reverse=True, key=lambda x: x[1])
        return rl[i][0]
def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps:
    :param sentence:
    :param sent_dict:
    :return: a SentenceData object containing text-based information about the sentence
    '''
    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')

    words = re.split('[,.!?\r\n ]+', sentence)
    words.remove('')
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0

        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]

        num = t2n.text2num(word)
        if num == -1:
            num = ''
        else:
            num = str(num)

        w = WordData(word, float(timestamps[ind * 2]), float(timestamps[ind * 2 + 1]),
                     c_sentiment, len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)

    return s
def smog_score(text=None, abbr=None, hyphen=None, vars={}):
    """Calculate SMOG score."""
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['polysyblword_count'] = sybl_counts(text, abbr, hyphen, True)['polysyblword_count']
    return 3.1291 + 1.0430 * sqrt(
        30 * (vars['polysyblword_count'] / float(vars['sent_count'])))
def getSyllables(request, strParm):
    from hyphen import Hyphenator

    # your language: English
    h_en = Hyphenator('en_US')
    # this makes sure the words are decoded consistently
    style = 'utf-8'

    # build the word list from the comma-separated request parameter
    # (assumes the words arrive in strParm)
    wordList = [word.strip() for word in strParm.replace("\n", "").split(",")]

    words = []
    # for each word in your word list
    for word in wordList:
        # this cuts the word into syllables
        brokenUpWord = '-'.join(h_en.syllables(word.decode(style)))
        # this gets the count of syllables
        countOfSyllables = len(brokenUpWord.split('-'))
        # collect the output
        words.append(brokenUpWord + '; ' + str(countOfSyllables) + ' syllable' +
                     ('s' if countOfSyllables > 1 else '') + '\n')

    return HttpResponse(json.dumps({'words': words}),
                        content_type='application/json')
def sybl_counts(text, abbr=Abbreviations(), hyphen=Hyphenator('en_US'), prepped=False):
    """Count number of syllables in text, return in sybl_count;
    count number of words with three or more syllables, return in polysyblword_count.
    """
    if not prepped:
        text = word_array(text, abbr)

    sybl_count = 0
    polysyblword_count = 0
    for word in text:
        syblperword_c = max(1, len(hyphen.syllables(word)))
        sybl_count += syblperword_c
        if syblperword_c >= 3:
            polysyblword_count += 1

    return {'sybl_count': sybl_count, 'polysyblword_count': polysyblword_count}
def fleschkincaid_score(text=None, abbr=None, hyphen=None, vars={}):
    """Calculate Flesch-Kincaid score."""
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['sybl_count'] = sybl_counts(text, abbr, hyphen, True)['sybl_count']
    return -15.59 + 0.39 * (vars['word_count'] / vars['sent_count']) + \
        11.8 * (vars['sybl_count'] / vars['word_count'])
def _set_lang_dict(self):
    if self.dict_download:
        try:
            if not is_installed(self.lang_code):
                if self.verbose:
                    print(Msg.DICT_INSTALL(self.lang_code))
                install(self.lang_code)
            self.lang_dict = Hyphenator(self.lang_code)
        except:
            pass

        if self.verbose:
            if is_installed(self.lang_code):
                print(Msg.DICT_INSTALLED(self.lang_code))
            else:
                print(Msg.DICT_INSTALL_FAILED(self.lang_code))
def parse(self):
    p = Pinyin()
    s = Hyphenator('en_US')
    with codecs.open(self.filepath, encoding='utf-8', mode='r') as f:
        for line in f:
            self.count = self.count + 1
            line = line[0:-1]
            words = line.split()
            if len(words) != 2:
                print "Error on line", self.count
                raise ValueError
            c = words[0].strip()
            e = words[1].strip().lower()
            self.ch.append(c)
            self.pinyin.append(p.get_pinyin(c, ' ').split())
            self.en.append(e)
            if len(e) > 3:
                syll = s.syllables(e)
                syll = self.sub_syllables(e, c, syll)
            else:
                syll = [e]
            self.syllables.append(syll)
def gunningfog_score(text=None, abbr=None, hyphen=None, vars={}):
    """Calculate Gunning Fog score."""
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['polysyblword_count'] = sybl_counts(text, abbr, hyphen, True)['polysyblword_count']
    return 0.4 * ((vars['word_count'] / float(vars['sent_count'])) +
                  100 * (vars['polysyblword_count'] / float(vars['word_count'])))
def flesch_score(text=None, abbr=None, hyphen=None, vars={}):
    """Calculate Flesch Reading Ease score."""
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['sybl_count'] = sybl_counts(text, abbr, hyphen, True)['sybl_count']
    return 206.835 - 1.015 * (vars['word_count'] / float(vars['sent_count'])) - \
        84.6 * (vars['sybl_count'] / float(vars['word_count']))
def test_beautiful(self):
    h_en = Hyphenator('en_US')
    self.assertEqual([['beau', 'tiful'], [u'beauti', 'ful']], h_en.pairs('beautiful'))
    self.assertEqual(['beau-', 'tiful'], h_en.wrap('beautiful', 6))
    self.assertEqual(['beauti-', 'ful'], h_en.wrap('beautiful', 7))
    self.assertEqual(['beau', 'ti', 'ful'], h_en.syllables('beautiful'))
def main():
    parser = argparse.ArgumentParser(
        description="Wrap text file to given width, with hyphenation"
    )
    parser.add_argument("-w", "--width", type=int, default=70,
                        help="Maximum line width")
    parser.add_argument("-l", "--language", default="en_US",
                        help="Text file locale")
    parser.add_argument("path",
                        help="Text file path. Use '-' to read from standard input.")
    args = parser.parse_args()

    hyphenator = Hyphenator(args.language)

    if args.path == "-":
        for content in sys.stdin:
            for line in textwrap2.wrap(content, width=args.width, use_hyphenator=hyphenator):
                print(line)
    else:
        with open(args.path) as f:
            for line in textwrap2.wrap(f.read(), width=args.width, use_hyphenator=hyphenator):
                print(line)
class HyphenationIntroducer:
    def __init__(self, p_hyphen: float):
        self.p_hyphen = p_hyphen
        self.hyphenator = Hyphenator()

    def get_candidates(self, token: str) -> List[str]:
        try:
            return self.hyphenator.pairs(token)
        except:
            return []

    def introduce_hyphens(self, text: str) -> str:
        tokens = text.split(" ")
        for i in range(len(tokens)):
            candidates = self.get_candidates(tokens[i])
            if len(candidates) > 0 and flip_coin(random, self.p_hyphen):
                candidate = random.choice(candidates)
                tokens[i] = "-".join(candidate)
        return " ".join(tokens)
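A minimal usage sketch for the class above. `flip_coin` comes from the original project; here it is assumed to behave like `random.random() < p`, and the exact output depends on the default hyphenation dictionary and the random seed.

# Hypothetical usage of HyphenationIntroducer (assumes flip_coin(rng, p) ~ rng.random() < p).
introducer = HyphenationIntroducer(p_hyphen=0.3)
print(introducer.introduce_hyphens("hyphenation makes wrapped text easier to read"))
# e.g. "hy-phenation makes wrapped text eas-ier to read" (output is random)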
def __init__(self, text, abbr=Abbreviations(), hyphen=Hyphenator('en_US'), easy=EasyWords()):
    text = punct_clean(text, abbr)
    self.sent_count = sent_count(text, abbr, True)
    self.char_count = char_count(text, abbr, True)

    text = word_array(text, abbr, True)
    self.word_count = word_count(text, abbr, True)
    self.notdalechall_count = notdalechall_count(text, abbr, easy, True)

    sybl_list = sybl_counts(text, abbr, hyphen, True)
    self.sybl_count = sybl_list['sybl_count']
    self.polysyblword_count = sybl_list['polysyblword_count']

    self.counts = {
        'char_count': self.char_count,
        'word_count': self.word_count,
        'sent_count': self.sent_count,
        'sybl_count': self.sybl_count,
        'notdalechall_count': self.notdalechall_count,
        'polysyblword_count': self.polysyblword_count
    }

    self.flesch_score = flesch_score(vars=self.counts)
    self.fleschkincaid_score = fleschkincaid_score(vars=self.counts)
    self.gunningfog_score = gunningfog_score(vars=self.counts)
    self.smog_score = smog_score(vars=self.counts)
    self.dalechall_score = dalechall_score(vars=self.counts)

    self.scores = {
        'flesch_score': self.flesch_score,
        'fleschkincaid_score': self.fleschkincaid_score,
        'gunningfog_score': self.gunningfog_score,
        'smog_score': self.smog_score,
        'dalechall_score': self.dalechall_score
    }
class HyphenatorAlgorithm(object):
    """
    This is a small wrapper on the Hyphenator method from our Hyphen import.
    Conforms to the same return type as the HyphenatorDictionary class.
    """

    def __init__(self):
        """Initialize the class."""
        self._hyphenator = Hyphenator('en_US')

    def syllables(self, word):
        """
        Calculates the number of syllables. If the count would be 0 it
        returns 1, since every word counts as at least one syllable.
        """
        syll = self._hyphenator.syllables(unicode(word))
        length = len(syll)
        if length != 0:
            return length
        else:
            return 1
from hyphen import Hyphenator
from hyphen.dictools import *
import sys

for lang in ['es_MX', 'es_SP']:
    if not is_installed(lang):
        install(lang)

h_mx = Hyphenator('es_MX')
for word in sys.argv[1:]:
    # decode the UTF-8 command-line argument to unicode before hyphenating
    print h_mx.syllables(word.decode('utf-8'))
def poem_generate(num_pairs):
    print "We are doing the 2nd order Markov model!"
    print "Number of poems to generate:", num_pairs  # how many pairs to generate

    ending_words_dict = sample_ending_word(num_pairs)
    poems_dict = dict()
    h_en = Hyphenator('en_US')
    prondict = nltk.corpus.cmudict.dict()

    for ind in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
        print "Group:", ind
        # get ending words
        ending_words = ending_words_dict[ind]

        # preprocess data
        corpusname = '../data/grouping2/group' + ind + '.txt'
        corpus = importasline(corpusname, ignorehyphen=False)

        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
        ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))]
        words = vectorizer.get_feature_names()
        print "Number of words:", len(words)

        # train in a reverse direction
        for i, line in enumerate(Y):
            Y[i] = line[::-1]

        # generate number of syllables for every word
        words_num_syllables = np.zeros(len(words), dtype=int)
        for wordid, word in enumerate(words):
            try:
                phon = prondict[word][0]
                words_num_syllables[wordid] = sum(map(hasNumbers, phon))
            except:
                words_num_syllables[wordid] = len(h_en.syllables(unicode(word)))
            if not words_num_syllables[wordid]:
                words_num_syllables[wordid] = count_syllables(word)

        # train model
        modelname = 'model2rdMMgroup' + ind
        hmm = Markov(len(words), Y, words_num_syllables, modelname)
        print(len(hmm.inversetable))

        # generate poems
        subpoems = [None] * num_pairs
        for pairid in range(num_pairs):
            start_token = ending_tokens[pairid]
            robotpoem0 = ''
            line0, linew0 = hmm.generating_random_line_end(start_token[0])
            for j in linew0[-2::-1]:
                robotpoem0 += ' ' + words[j] + ' '
            print(robotpoem0)
            robotpoem1 = ''
            line1, linew1 = hmm.generating_random_line_end(start_token[1])
            for j in linew1[-2::-1]:
                robotpoem1 += ' ' + words[j] + ' '
            print(robotpoem1)
            subpoems[pairid] = (robotpoem0, robotpoem1)

        # add the best subpoem to poems_dict
        poems_dict[ind] = subpoems

    # write down the poems
    poem_file_name = '../poems2rdMM/reverse_with_punctuations.txt'
    fwrite = open(poem_file_name, 'w')
    for poemid in range(num_pairs):
        # construct poems
        robotpoem = [None] * 14
        robotpoem[0] = poems_dict['A'][poemid][0]
        robotpoem[2] = poems_dict['A'][poemid][1]
        robotpoem[1] = poems_dict['B'][poemid][0]
        robotpoem[3] = poems_dict['B'][poemid][1]
        robotpoem[4] = poems_dict['C'][poemid][0]
        robotpoem[6] = poems_dict['C'][poemid][1]
        robotpoem[5] = poems_dict['D'][poemid][0]
        robotpoem[7] = poems_dict['D'][poemid][1]
        robotpoem[8] = poems_dict['E'][poemid][0]
        robotpoem[10] = poems_dict['E'][poemid][1]
        robotpoem[9] = poems_dict['F'][poemid][0]
        robotpoem[11] = poems_dict['F'][poemid][1]
        robotpoem[12] = poems_dict['G'][poemid][0]
        robotpoem[13] = poems_dict['G'][poemid][1]
        robotpoem = Format(robotpoem)
        # write into file
        print>>fwrite, str(poemid)
        for lineid in range(14):
            print>>fwrite, robotpoem[lineid]
    fwrite.close()
def getSentenceSyllCount(sentence):
    h = Hyphenator('en_GB')
    count = 0
    for word in sentence:
        count += max(len(h.syllables(unicode(word))), 1)
    return count
class checkTweet():
    def __init__(self, text='Default Tweet'):
        # only keep latin chars:
        self.rawText = re.sub(ur'[^\x00-\x7F]', u'', text)
        self.textWords = self.rawText.split()
        self.h_en = Hyphenator('en_US')
        self.badSymbols = ['http:', 'https:', '&']
        self.forbiddenThings = ['@']  # random syms
        self.forbiddenWords = ['el', 'la', 'en', 'tu',  # spanish
                               'Et', 'le', 'aux', 'les', 'de', 'des', 'du', 'il', 'Elle',
                               'ses', 'sa', 'ces', 'cela', 'est', 'vous', 'tous', 'nous',
                               'allez', 'alons']  # french
        self.forbiddenEnds = ['the', 'and', 'a', 'an', 'for', 'at', 'except', 'or',
                              'has', 'my', 'your', 'their', 'his', 'hers', 'her\'s',
                              'get', 'it\'ll', 'to', 'like', 'is', 'I']

    def qualityControl(self):
        self.replaceText()
        self.remove_at_symbol_first()
        self.remove_symbolWords()
        if self.check_forbiddenThings():
            return False
        print "post QC tweet: ", " ".join(self.textWords)
        return True

    def replaceText(self):
        self.textWords = [w.replace('#', 'hashtag ') for w in self.textWords]

    def remove_at_symbol_first(self):
        if re.search('RT', self.textWords[0]):
            del self.textWords[0]
        if re.search('@', self.textWords[0]):
            del self.textWords[0]

    def remove_symbolWords(self):
        # remove words with badSymbols
        for i, word in enumerate(self.textWords):
            for s in self.badSymbols:
                if re.search(s, word):
                    del self.textWords[i]
                    break

    def words_no_vowels(self, wordList):
        for word in wordList:
            if not re.search("([aeiouyAEIOUY]+)", word):
                print word, ' - did not contain any vowels'
                return True
        return False

    def check_forbiddenThings(self):
        for s in self.forbiddenThings:
            if any([re.search(s, word) for word in self.textWords]):
                print 'the forbidden thing: ', s, ' was found'
                return True
        for s in self.forbiddenWords:
            if any([re.search('^' + s + '$', word, re.IGNORECASE) for word in self.textWords]):
                print 'the forbidden word: ', s, ' was found'
                return True
        return False

    def checkSylbls(self, Nsyls):
        finalWords = self.confirmSylsCounts(Nsyls)
        if not finalWords or self.words_no_vowels(finalWords) \
                or any(finalWords[-1] == s for s in self.forbiddenEnds):
            return list()
        print Nsyls, "syls found... final text: ", finalWords
        return finalWords

    def confirmSylsCounts(self, Nsyls):
        nWords = len(self.textWords)
        i = 0
        sylsCount = 0
        tooHard = False
        # loop until the end of the word list, we count Nsyls or can't figure out a word
        while i < nWords and sylsCount < Nsyls and not tooHard:
            if len(self.textWords[i]) >= 100:
                # hyphenator will break and something is crazy
                return list()
            libreSyls = len(self.h_en.syllables(self.textWords[i]))
            libreSyls = max(libreSyls, 1)
            simplSyls = self.count_syllables(self.textWords[i])
            if libreSyls == simplSyls[0] or libreSyls == simplSyls[1]:
                sylsCount = sylsCount + libreSyls
            elif simplSyls[0] == simplSyls[1]:
                sylsCount = sylsCount + simplSyls[1]
            else:
                # this tweet is too hard
                tooHard = True
            i += 1
        if (sylsCount == Nsyls) and not tooHard:
            return self.textWords[:i]
        else:
            return list()

    def count_syllables(self, word):
        if not word:
            return 0, 0

        vowels = ['a', 'e', 'i', 'o', 'u']
        on_vowel = False
        in_diphthong = False
        minsyl = 0
        maxsyl = 0
        lastchar = None

        word = word.lower()
        for c in word:
            is_vowel = c in vowels
            if on_vowel == None:
                on_vowel = is_vowel

            # y is a special case
            if c == 'y':
                is_vowel = not on_vowel

            if is_vowel:
                if not on_vowel:
                    # We weren't on a vowel before.
                    # Seeing a new vowel bumps the syllable count.
                    minsyl += 1
                    maxsyl += 1
                elif on_vowel and not in_diphthong and c != lastchar:
                    # We were already in a vowel.
                    # Don't increment anything except the max count,
                    # and only do that once per diphthong.
                    in_diphthong = True
                    maxsyl += 1

            on_vowel = is_vowel
            lastchar = c

        # Some special cases:
        if word[-1] == 'e':
            minsyl -= 1
        # if it ended with a consonant followed by y, count that as a syllable.
        if word[-1] == 'y' and not on_vowel:
            maxsyl += 1

        return minsyl, maxsyl
import sys
import json

"""
2.7 and up version is capitalized (annoying)
"""
if sys.version_info >= (2, 7):
    from hyphen import Hyphenator, dictools
    hy = Hyphenator('en_US')
else:
    from hyphen import hyphenator, dictools
    hy = hyphenator('en_US')

try:
    json_object = {}
    for word in sys.argv[1:]:
        json_object[word] = hy.syllables(unicode(word))
    print json.dumps(json_object)
except IndexError:
    sys.exit(1)

sys.exit(0)
def count_syllables(word):
    hyphenator = Hyphenator('en_US')
    return max(len(hyphenator.syllables(word)), 1)
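The helper above constructs a new Hyphenator on every call, which reloads the dictionary each time. A minimal sketch of the same counting logic with a single reused instance, assuming the en_US dictionary is installed; the cached name is illustrative only.

# Sketch: reuse one Hyphenator instead of constructing it per call.
from hyphen import Hyphenator

_HYPHENATOR = Hyphenator('en_US')

def count_syllables_cached(word):
    # every word counts as at least one syllable
    return max(len(_HYPHENATOR.syllables(word)), 1)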
def poem_generate(num_of_hidden_states, num_pairs):
    print "Number of hidden states:", num_of_hidden_states
    print "Number of poems to generate:", num_pairs  # how many pairs to generate

    ending_words_dict = sample_ending_word(num_pairs)
    poems_dict = dict()
    h_en = Hyphenator('en_US')
    prondict = nltk.corpus.cmudict.dict()

    prob_file_name = '../probability/prob_num' + str(num_of_hidden_states) + '.txt'
    fwrite = open(prob_file_name, 'w')

    # for ind in ['A']:
    for ind in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
        print "Group:", ind
        # get ending words
        ending_words = ending_words_dict[ind]

        # preprocess data
        corpusname = '../data/grouping1/group' + ind + '.txt'
        corpus = importasline(corpusname, ignorehyphen=False)

        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
        ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))]
        words = vectorizer.get_feature_names()
        print "Number of words:", len(words)

        # train in a reverse direction
        for i, line in enumerate(Y):
            Y[i] = line[::-1]

        # generate number of syllables for every word
        words_num_syllables = np.zeros(len(words), dtype=int)
        for wordid, word in enumerate(words):
            try:
                phon = prondict[word][0]
                words_num_syllables[wordid] = sum(map(hasNumbers, phon))
            except:
                words_num_syllables[wordid] = len(h_en.syllables(unicode(word)))
            if not words_num_syllables[wordid]:
                words_num_syllables[wordid] = count_syllables(word)

        # train model
        ntrial = 10
        logp = np.zeros(ntrial)  # logp is a 1-D array
        subpoems = [None] * num_pairs
        for i in range(ntrial):
            modelname = 'modelnhiddengroup' + ind + '_' + str(num_of_hidden_states) + '_trial' + str(i)
            hmm = modelhmm(num_of_hidden_states, len(words), Y, words_num_syllables, modelname)
            logp[i] = hmm.trainHHM(Y)
            if (i == 0) or (i > 0 and logp[i] > max(logp[0:i])):
                hmm.savemodel()

        hmm.loadmodel()

        # generate poems
        for pairid in range(num_pairs):
            start_token = ending_tokens[pairid]
            robotpoem0 = ''
            line0, linew0 = hmm.generating_random_line_end(start_token[0])
            for j in linew0[::-1]:
                robotpoem0 += ' ' + words[j] + ' '
            print robotpoem0, 'robotpoem0'
            robotpoem1 = ''
            line1, linew1 = hmm.generating_random_line_end(start_token[1])
            for j in linew1[::-1]:
                robotpoem1 += ' ' + words[j] + ' '
            print(robotpoem1)
            subpoems[pairid] = (robotpoem0, robotpoem1)

        hmm.analyzing_word(words)

        # add the best subpoem to poems_dict
        poems_dict[ind] = subpoems

        print>>fwrite, ind
        print>>fwrite, str(logp)
        print "List of log probability:", logp

    fwrite.close()

    # write down the poems
    poem_file_name = '../poems_counting/reverse_' + str(num_of_hidden_states) + '.txt'
    fwrite = open(poem_file_name, 'w')
    for poemid in range(num_pairs):
        # construct poems
        robotpoem = [None] * 14
        robotpoem[0] = poems_dict['A'][poemid][0]
        robotpoem[2] = poems_dict['A'][poemid][1]
        robotpoem[1] = poems_dict['B'][poemid][0]
        robotpoem[3] = poems_dict['B'][poemid][1]
        robotpoem[4] = poems_dict['C'][poemid][0]
        robotpoem[6] = poems_dict['C'][poemid][1]
        robotpoem[5] = poems_dict['D'][poemid][0]
        robotpoem[7] = poems_dict['D'][poemid][1]
        robotpoem[8] = poems_dict['E'][poemid][0]
        robotpoem[10] = poems_dict['E'][poemid][1]
        robotpoem[9] = poems_dict['F'][poemid][0]
        robotpoem[11] = poems_dict['F'][poemid][1]
        robotpoem[12] = poems_dict['G'][poemid][0]
        robotpoem[13] = poems_dict['G'][poemid][1]
        robotpoem = Format(robotpoem)
        # write into file
        print>>fwrite, str(poemid)
        for lineid in range(len(robotpoem)):
            print>>fwrite, robotpoem[lineid]
    fwrite.close()
# pitch_array = np.array(pitch_info[1])
# print np.mean(pitch_array)
# print np.std(pitch_array)

# sent_dict = build_sentiment_dict(sentiment_dict_path)
# for key in sent_dict:
#     print key

# ts = get_time_stamp(bml_path)
# for t in ts:
#     print t

# words = re.split('\W+', 'OK, well, shall we start? Welcome to Finnmore Associates!')
# words.remove('')
# for ind, word in enumerate(words):
#     print word, ind

h_en = Hyphenator('en_US')
# print len(h_en.syllables(unicode(u'beautiful')))

# alist = [1, 2, 4, 2, 5, 6, 9.1]
# barray = np.array(alist)
# dev = np.std(barray)
# m = np.mean(barray)
# print m
# print dev
def main():
    # Load Shakespeare dataset.
    sonnets = util.loadShakespeareSonnets()
    tokens = util.getUniqueSyllables(sonnets)
    numObs = len(tokens)
    numStates = 4
    model = hmm_end_state.HMM(numStates, numObs)

    # Train model on tokenized dataset.
    h = Hyphenator('en_GB')
    words = []
    for sonnet in sonnets:
        for sentence in sonnet:
            for word in sentence:
                tokenizedWord = []
                syllables = h.syllables(unicode(word.lower()))
                if len(syllables) < 2:
                    tokenizedWord.append(tokens.index(unicode(word.lower())))
                else:
                    for syllable in syllables:
                        tokenizedWord.append(tokens.index(syllable))
                words.append(tokenizedWord)
    model.train(words, maxIter=4)

    # Generate artificial sonnet with any generated words and detokenize it.
    artificialSonnet = model.generateSonnetFromSyllables(numSentences=14,
                                                         numWordsPerSentence=8)
    detokenizedSonnet = []
    for sentence in artificialSonnet:
        detokenizedSentence = []
        for w, word in enumerate(sentence):
            detokenizedWord = ''
            if w == 0:
                syll = word[0]
                detokenizedWord += tokens[syll][0].upper() + tokens[syll][1:]
                for syll in word[1:]:
                    detokenizedWord += tokens[syll]
            else:
                for syll in word:
                    detokenizedWord += tokens[syll]
            detokenizedSentence.append(detokenizedWord)
        detokenizedSonnet.append(detokenizedSentence)

    # Write detokenized sonnet to text file.
    util.writeSonnetToTxt(detokenizedSonnet)

    # Generate artificial sonnet with only valid words and detokenize it.
    artificialSonnet = model.generateSonnetFromSyllables(
        numSentences=14,
        numWordsPerSentence=8,
        validWords=util.getUniqueWords(sonnets),
        tokens=tokens)
    detokenizedSonnet = []
    for sentence in artificialSonnet:
        detokenizedSentence = []
        for w, word in enumerate(sentence):
            detokenizedWord = ''
            if w == 0:
                syll = word[0]
                detokenizedWord += tokens[syll][0].upper() + tokens[syll][1:]
                for syll in word[1:]:
                    detokenizedWord += tokens[syll]
            else:
                for syll in word:
                    detokenizedWord += tokens[syll]
            detokenizedSentence.append(detokenizedWord)
        detokenizedSonnet.append(detokenizedSentence)

    # Write detokenized sonnet to text file.
    util.writeSonnetToTxt(detokenizedSonnet)
def split_lyrics_to_syllables(selected_song, user_lyrics):
    """
    The lyrics text in the original music scores is split into multiple syllables,
    and each syllable is paired with one or more keys/beats in the song.
    For example, in the "Happy Birthday" song, the word "happy" has been split into
    "hap" and "py" and each syllable corresponds to one beat in the song.
    Hence, we need to split the user lyrics into multiple syllables as well.

    This function uses a Hyphenator to split the user's lyrics into syllables until
    the syllables fit into the modifiable region of the song music score, i.e. the
    number of syllables from the split user lyrics should equal the number of
    syllables in the modifiable region of the music score. The modifiable region of
    each song has already been defined in song_details.json and can be obtained
    through the argument selected_song.

    Arguments:
        selected_song - A JSON object representing the song selected by the user.
            This object includes information such as the song music score file path,
            the original song lyrics and the position of the modifiable region of the
            music score. The JSON object is retrieved from api/static/song_details.json.
        user_lyrics - A string which is the lyrics text that will replace the original
            lyrics in the modifiable portion of the song music score.

    Exceptions raised:
        ValueError - Raised when the song language is not English or Spanish.
        RuntimeError - Raised when the split user lyrics cannot fit into the song
            modifiable region.

    Return:
        split_user_lyrics - A list of strings, where the length of the list is equal to
            the length of the modifiable region in the music score, and each string in
            the list will replace one syllable in the modifiable region of the song.
    """
    # retrieve the position of the modifiable lyrics region in the music score & the song language
    start_edit_pos, end_edit_pos, song_language = selected_song[
        "startEditPos"], selected_song["endEditPos"], selected_song["language"]

    # determine the total number of syllables that can be modified in the music score file
    xml_edit_num = end_edit_pos - start_edit_pos + 1

    # create Hyphenator object based on song language
    if song_language == "en_US":
        h = Hyphenator('en_US')
    elif song_language == "es":
        h = Hyphenator('es')
    else:
        raise ValueError(
            "Song language not supported, currently only English and Spanish are supported."
        )

    split_user_lyrics = []

    # split the user's lyrics sentence into a list of words
    user_lyrics_words = user_lyrics.split()

    # split each word into its corresponding syllables
    user_lyrics_syllables = []
    for word in user_lyrics_words:
        syllable = h.syllables(word)
        if syllable != []:
            user_lyrics_syllables += syllable
        else:
            # handle the case of a single-syllable word
            user_lyrics_syllables.append(word)

    syllable_fitting_ratio = xml_edit_num / len(user_lyrics_syllables)

    if syllable_fitting_ratio == 1:
        # split user lyrics syllables fit perfectly into the modifiable area
        split_user_lyrics = user_lyrics_syllables
    elif syllable_fitting_ratio > 1:
        # split user lyrics syllables can fit into the modifiable area but there are too few of them
        while len(user_lyrics_syllables) < xml_edit_num:
            user_lyrics_syllables.append("")
        split_user_lyrics = user_lyrics_syllables
    else:
        # there are more split user lyrics syllables than the number of syllables required in
        # the modifiable area, so the lyrics need to be re-split at the word level
        word_fitting_ratio = xml_edit_num / len(user_lyrics_words)
        if word_fitting_ratio == 1:
            # the number of words in the user lyrics fits the music score modifiable area
            split_user_lyrics = user_lyrics_words
        elif word_fitting_ratio > 1:
            # the words fit into the modifiable area, but there are too few of them
            while len(user_lyrics_words) < xml_edit_num:
                user_lyrics_words.append("")
            split_user_lyrics = user_lyrics_words
        else:
            # the number of words in the user lyrics cannot fit into the modifiable area:
            # repeatedly combine the first two words into one until word_fitting_ratio becomes 1
            # (i.e. until the user lyrics words fit into the modifiable area)
            while word_fitting_ratio != 1 and len(user_lyrics_words) > 1:
                user_lyrics_words[0:2] = [''.join(user_lyrics_words[0:2])]
                word_fitting_ratio = xml_edit_num / len(user_lyrics_words)
            split_user_lyrics = user_lyrics_words

    if len(split_user_lyrics) == xml_edit_num:
        return split_user_lyrics
    else:
        raise RuntimeError(
            'Fail to fit user lyrics into the song modifiable region')
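A minimal usage sketch for the function above; the dictionary below is hypothetical and only mimics the startEditPos/endEditPos/language fields read from song_details.json, and the exact syllable split depends on the installed en_US dictionary.

# Hypothetical example input; a real selected_song comes from api/static/song_details.json.
selected_song = {"startEditPos": 3, "endEditPos": 8, "language": "en_US"}
print(split_lyrics_to_syllables(selected_song, "happy birthday dear friend"))
# six entries for the six modifiable beats, e.g. ['hap', 'py', 'birth', 'day', 'dear', 'friend']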
class Paragraph(Text):
    def __init__(self, text, margin, indent, lang="en_US", hyphen_char="\u2010"):
        Text.__init__(self, text, lang)
        self.margin = margin
        self.indent = indent
        if self.lang in dict_info.keys():
            self.hyphenator = Hyphenator(self.lang)
        else:
            self.hyphenator = None
        self.hyphen_char = hyphen_char
        self.header.c = " " * self.indent

    def justify(self):
        e = self.header
        while e.next:
            # we go to the element which may be followed by consecutive elements,
            # but together they fit in the margin space
            while e.next and e.next.very_end() <= self.margin:
                e = e.next
            # now we go to the last break point before the margin
            e = e.next_break()
            remaining_space = self.margin - e.end()
            if e.next:
                hyphen_next_word = self.hyphenator.wrap(e.next.c, remaining_space,
                                                        hyphen=self.hyphen_char)
                if hyphen_next_word:
                    # we save the next element after the hyphenated word
                    element_next_after = e.next.next
                    # we replace the hyphenated word with its first part,
                    e.next = Word(hyphen_next_word[0][:-len(self.hyphen_char)])
                    e.next.link(e)
                    # add a hyphen char
                    e.next.next = Punctation(self.hyphen_char)
                    e.next.next.link(e.next)
                    # insert a newline
                    e.next.next.next = Break("\n")
                    e.next.next.next.link(e.next.next)
                    # we also save this for complete justification later
                    j = e.next.next.next
                    # and put the second part of the hyphenated word at the beginning of the new line
                    e.next.next.next.next = Word(hyphen_next_word[1])
                    e.next.next.next.next.link(e.next.next.next)
                    # and link the saved element to it
                    element_next_after.link(e.next.next.next.next)
                    e = element_next_after
                else:
                    e.newlineize()
                    # we also save this for complete justification later
                    j = e
                    # we also go to the next e for our while loop
                    e = e.next

                # now we try to fill our line with whitespaces ...
                # ... for this we collect the whitespace elements of the current line
                spaces = []
                c_e = j
                while c_e.prev != j.line_start():
                    if isinstance(c_e, Break) and c_e.space():
                        spaces.append(c_e)
                    c_e = c_e.prev

                # and increase their size until the line is filled
                while j.start() < self.margin:
                    # get the minimum length of spaces
                    minimum_length_space = 1000
                    for i in spaces:
                        if i.length() < minimum_length_space:
                            minimum_length_space = i.length()
                    # now we collect the nicest kinds of spaces
                    minimum_spaces = set()
                    priority_spaces = set()
                    for i in spaces:
                        if i.length() == minimum_length_space:
                            minimum_spaces.add(i)
                        if i.next_to_punctation():
                            priority_spaces.add(i)
                    # let's try to get one randomly from the intersection of the two
                    if len(minimum_spaces & priority_spaces) > 0:
                        b = random.sample(minimum_spaces & priority_spaces, 1)[0]
                    else:
                        b = random.sample(minimum_spaces, 1)[0]
                    # now increase b's length
                    b.c += " "