Exemple #1
0
class Word(BasicText):
    """Represents a single word with surrounding punctuation stripped.

    The stored text is normalised by the ``text`` setter, which removes
    leading/trailing spaces, brackets, quotes and sentence punctuation.
    """

    # The shared hyphenator is only created when the ``hyphen`` package has
    # already been imported; otherwise ``Word.h_en`` is never defined and
    # ``count_syllables`` will raise AttributeError.
    if "hyphen" in sys.modules:
        h_en = Hyphenator('en_US')

    def __init__(self, text):
        """Initializes a Word from ``text`` (routed through the setter)."""
        self.text = text

    @BasicText.text.setter
    def text(self, new_text):
        """Store ``new_text`` with surrounding punctuation/whitespace removed."""
        self._text = new_text.strip(""" (),.?!;:\"\'""")

    def count_syllables(self):
        """Counts the number of syllables for an English language Word.

        Returns the hyphenator's syllable count when it is positive, 1 when
        the hyphenator reports no syllables, and 30 when the hyphenator
        rejects the word with ValueError.
        """
        try:
            n_syllables = len(Word.h_en.syllables(self.text))
            if n_syllables > 0:
                return n_syllables
        except ValueError:
            # Thrown by syllables function for words longer than 100 characters long.
            return 30
        return 1

    def is_adverb(self):
        """Determines whether the word looks like an adverb (ends in "ly").

        Returns a match object (truthy) when the whole word consists of one
        or more word characters followed by a final "ly", otherwise None.
        """
        # Anchor at the end: the unanchored pattern also accepted words that
        # merely contain "ly" after their first character (e.g. "flying").
        return re.match(r"\w+ly\Z", self.text)
Exemple #2
0
def build_sentence_info(timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param timestamps: indexable sequence holding two entries per word; the
        entries at 2*i and 2*i+1 are attached to word i (presumably start and
        end time -- confirm against the caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        missing get sentiment 0
    :return: list of tuples (word, timestamps[2*i], timestamps[2*i+1],
        syllable count, sentiment, trailing punctuation, text2num result)
    '''
    h_en = Hyphenator('en_US')
    info_list = []
    # Splitting on separators leaves empty strings at the start and/or end of
    # the list. Filter them all out: ``words.remove('')`` raised ValueError
    # when there was none (e.g. "hello world") and left a stray '' behind when
    # there were two (e.g. " hello world.").
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # If the whitespace-delimited token differs from the bare word, keep
        # its last character as the attached punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        num = t2n.text2num(word)
        info_list.append(
            (word, timestamps[ind * 2], timestamps[ind * 2 + 1],
             len(h_en.syllables(unicode(word))), c_sentiment, punct, num))
    return info_list
Exemple #3
0
def syllablize(poem):
    """Split every word of every line in ``poem`` into syllables.

    Installs the hyphenation dictionary for the module-level ``language``
    when it is missing. Returns a list with one entry per non-empty line;
    each entry is a list of words whose syllables are separated by single
    spaces. Words for which the hyphenator returns no syllables are kept
    unchanged.
    """
    if not is_installed(language):
        install(language)
    hyph = Hyphenator(language)

    output = []
    for line in poem:
        syllablized_line = []
        for word in line.split():
            syls = hyph.syllables(word)
            # An empty result means "no split points": keep the word whole.
            spaced = word if len(syls) == 0 else " ".join(syls)
            syllablized_line.append(spaced.strip())

        # Lines without any words contribute nothing to the output.
        if syllablized_line:
            output.append(syllablized_line)

    return output
def main():
    """Rewrite ``filename`` in place with normalized, soft-hyphenated dialogue.

    The file is parsed into chapters, then written back chapter by chapter:
    a head code block, each entry's optional code block and dialogue line,
    and a tail code block, with blank lines between entries and chapters.
    """
    hyphenator = Hyphenator('en_GB')

    with open(filename, 'r', encoding='utf-8') as source:
        chapters = parse_chapters(source)

    with open(filename, 'w', encoding='utf-8', newline='\n') as f:
        for chapter_idx, chapter in enumerate(chapters):
            chapter_name, entries, head_eager_code, tail_eager_code = chapter
            print(chapter_name)

            f.write(f'@<|\n{head_eager_code}\n|>\n')

            for entry_idx, (code, chara_name, dialogue) in enumerate(entries):
                if code:
                    f.write(f'<|\n{code}\n|>\n')
                if not dialogue:
                    continue

                # normalize -> soft hyphens -> non-breaking spaces
                dialogue = add_nbsp(
                    add_soft_hyphens(hyphenator, normalize(dialogue)))
                if chara_name:
                    f.write(f'{chara_name}::{dialogue}\n')
                else:
                    f.write(dialogue + '\n')
                # Blank separator after every dialogue entry except the last.
                if entry_idx < len(entries) - 1:
                    f.write('\n')

            if tail_eager_code:
                f.write(f'@<|\n{tail_eager_code}\n|>\n')
            else:
                f.write('@<||>\n')
            # Blank separator between chapters, none after the final one.
            if chapter_idx < len(chapters) - 1:
                f.write('\n')
Exemple #5
0
def make_dicts(filename):
    """Build a syllable lookup from ``filename`` and an en_US hyphenator.

    The file is lower-cased and read line by line; each non-empty line is
    split on backslashes. The second field is taken as the word and the last
    field, split on "-", as its syllables. Only single-token words are kept,
    and the first occurrence of a word wins.

    Returns a tuple ``(syllables, h_en)`` where ``syllables`` maps a word to
    its list of syllables and ``h_en`` is a ``Hyphenator('en_US')``.
    """
    syllables = {}

    with open(filename) as theFile:
        content = theFile.read().lower().replace('\r', '\n')

    for line in content.split("\n"):
        if line == "":
            continue
        fields = line.split('\\')
        # Lines without a backslash (e.g. whitespace-only lines) have no word
        # field; skip them instead of failing with IndexError.
        if len(fields) < 2:
            continue
        word = fields[1]
        if word not in syllables and len(word.split()) == 1:
            syllables[word] = fields[-1].split("-")

    # Make sure the hyphenation dictionary is available before using it.
    for lang in ['en_US']:
        if not is_installed(lang):
            install(lang)

    h_en = Hyphenator('en_US')

    return (syllables, h_en)
Exemple #6
0
def syllabizeNames(nameList):
    """Return the hyphenator's syllable list for every item in ``nameList``.

    Installs the en_US hyphenation dictionary first when it is missing.
    """
    if not is_installed('en_US'):
        install('en_US')
    hyphenator = Hyphenator('en_US')
    return [hyphenator.syllables(name) for name in nameList]
Exemple #7
0
def main(arguments: List[str] = None):
    """Command-line entry point; dispatches on ``namespace.command``.

    ``arguments`` is handed to ``parser.parse_args``. Recognised commands:

    * ``export_font`` -- collect glyphs (the ``GLYPHS`` defaults plus every
      character found in files matching the ``--text`` globs) and export the
      font.
    * ``tester`` -- delegate to ``.tester.main``.
    * ``hyphenate`` -- tokenize the input and write it back with the
      syllables of each word joined by "-".
    * ``render`` -- parse JSON page descriptions from the input and render
      them to a PDF.

    Any other command falls through without doing anything.
    """
    namespace = parser.parse_args(arguments)

    command = namespace.command

    if command == 'export_font':
        # Imported lazily so the other commands do not pay for it.
        from .pdf import PDF
        glyphs = set(GLYPHS)
        cwd = pathlib.Path('.')
        if namespace.text is not None:
            for text_glob in namespace.text:
                for text_file in cwd.glob(text_glob):
                    print(f'Taking glyphs from:\n  {text_file}')
                    # Every distinct character of the file becomes a glyph.
                    glyphs.update(set(text_file.read_text('utf-8')))
        font = PDF.font(namespace.font_name,
                        namespace.font_size,
                        glyphs=glyphs)
        font.export(namespace.output)
    elif command == 'tester':
        # NOTE: this binds the local name ``main`` to the tester's entry
        # point for the remainder of this call.
        from .tester import main
        main(namespace)
    elif command == 'hyphenate':
        text = namespace.input.read().decode()
        hyphenator = Hyphenator(language=namespace.language)
        # ``tokenize(text)`` is evaluated once before the loop starts, so
        # rebinding ``text`` as the loop variable is safe.
        for token_type, text in tokenize(text):
            if token_type is TokenType.WORD:
                # Fall back to the whole word when no syllables are returned.
                syllables = hyphenator.syllables(text) or [text]
                namespace.output.write_chunk('-'.join(syllables).encode())
            else:
                namespace.output.write_chunk(text.encode())
    elif command == 'render':
        import json

        from .printer import Page, FontSpec, Fragment
        from .pdf import PDF

        text = namespace.input.read()
        # Pages are separated by a NUL character followed by a newline.
        raw_pages = text.split('\0\n')

        pages = []

        for raw_page in raw_pages:
            # Skip empty chunks (e.g. after a trailing separator).
            if not raw_page:
                continue
            page_data = json.loads(raw_page)
            font_spec = FontSpec(page_data['font_spec']['name'],
                                 page_data['font_spec']['size'])
            paper_width = page_data['paper_width']
            paper_height = page_data['paper_height']
            fragments = [
                Fragment(**fragment) for fragment in page_data['fragments']
            ]
            page = Page(font_spec, paper_width, paper_height, fragments)
            pages.append(page)

        pdf = PDF(namespace.output)
        pdf.render(pages)
        pdf.finish()
Exemple #8
0
 def encode(self, word):
     """Encode ``word`` as a string of pattern-group indices.

     The word is split into syllables with the es_MX hyphenator. For each
     syllable, every regex in every group of ``self.patterns`` is tried in
     order, and the index of the group is appended once per matching regex.
     """
     hyphenator = Hyphenator('es_MX')
     digits = [
         str(group_index)
         for syllable in hyphenator.syllables(unicode(word))
         for group_index, group in enumerate(self.patterns)
         for expression in group
         if re.match(expression, syllable)
     ]
     return "".join(digits)
    def test_beautiful(self):
        """Check pairs, wrap and syllables of the en_US hyphenator on one word."""
        hyphenator = Hyphenator('en_US')
        word = 'beautiful'

        expected_pairs = [['beau', 'tiful'], [u'beauti', 'ful']]
        self.assertEqual(expected_pairs, hyphenator.pairs(word))

        # wrap() should pick the widest split that fits the given width.
        wrap_cases = (
            (6, ['beau-', 'tiful']),
            (7, ['beauti-', 'ful']),
        )
        for width, expected in wrap_cases:
            self.assertEqual(expected, hyphenator.wrap(word, width))

        self.assertEqual(['beau', 'ti', 'ful'], hyphenator.syllables(word))
def main(args):
    """Start the word-generation gRPC server described by ``args``.

    Configures logging, builds the word generator (plus an optional "urban"
    generator), warms the model up with one generated word, and then serves
    until interrupted from the keyboard.

    Raises RuntimeError when quantization is requested on a non-CPU device.
    """
    if args.quantize and args.device != "cpu":
        raise RuntimeError("Quantization only available on CPU devices")

    # Port precedence: command line, then the PORT environment variable,
    # then 8000.
    port = args.port or os.environ.get("PORT") or 8000

    # Remove handlers that are already installed so basicConfig applies.
    for installed in list(logging.root.handlers):
        logging.root.removeHandler(installed)

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    word_generator = WordGenerator(
        device=args.device,
        forward_model_path=args.forward_model_path,
        inverse_model_path=args.inverse_model_path,
        blacklist_path=args.blacklist_path,
        quantize=args.quantize,
        is_urban=False,
    )

    if args.forward_urban_model_path:
        logging.info(f"Creating urban model from {args.forward_urban_model_path}")
        urban_generator = WordGenerator(
            device=args.device,
            forward_model_path=args.forward_urban_model_path,
            inverse_model_path=None,
            blacklist_path=args.blacklist_path,
            quantize=args.quantize,
            is_urban=True,
        )
    else:
        urban_generator = None

    h_en = Hyphenator('en_US')

    logging.info(f"Warming up with word generation")
    gen_word = word_generator.generate_word()
    logging.info(f"Generated {gen_word}")

    servicer = WordServiceServicer(
        word_generator, h_en, urban_generator=urban_generator)
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    wordservice_pb2_grpc.add_WordServiceServicer_to_server(servicer, server)
    server.add_insecure_port(f"[::]:{port}")
    server.start()

    logging.info(f"Listening on port {port}")

    # Sleep in day-long intervals while the server runs; Ctrl-C stops it.
    try:
        while True:
            time.sleep(3600 * 24)
    except KeyboardInterrupt:
        server.stop(args.shutdown_grace_duration)
Exemple #11
0
def sybl_counts(text, abbr=Abbreviations(), hyphen=Hyphenator('en_US'), prepped=False):
    """Count syllables and polysyllabic words in ``text``.

    Unless ``prepped`` is true, ``text`` is first converted with
    ``word_array(text, abbr)``. Every word counts as at least one syllable.

    Returns a dict with ``sybl_count`` (total syllables) and
    ``polysyblword_count`` (words with three or more syllables).
    """
    words = text if prepped else word_array(text, abbr)
    # The hyphenator may return no syllables for a word; treat that as one.
    per_word = [max(1, len(hyphen.syllables(word))) for word in words]
    return {
        'sybl_count': sum(per_word),
        'polysyblword_count': sum(1 for count in per_word if count >= 3),
    }
Exemple #12
0
def smog_score(text=None, abbr=None, hyphen=None, vars=None):
    """Calculate SMOG score.

    When ``text`` is given, the sentence and polysyllabic-word counts are
    computed from it and stored in ``vars`` (a dict passed by the caller is
    updated in place). Without ``text``, ``vars`` must already contain
    ``polysyblword_count`` and ``sent_count``.

    Raises KeyError when a required count is missing and ZeroDivisionError
    when ``sent_count`` is 0.
    """
    # A fresh dict per call: the former ``vars={}`` default was shared between
    # calls, so counts from one text leaked into later calls.
    if vars is None:
        vars = {}
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['polysyblword_count'] = sybl_counts(text, abbr, hyphen,
                                                 True)['polysyblword_count']
    return 3.1291 + 1.0430 * sqrt(
        30 * (vars['polysyblword_count'] / float(vars['sent_count'])))
Exemple #13
0
def tokenize_word_to_syllables(word, lang):
    """Return the syllables of ``word`` as a list.

    The module-level ``hyphenator`` is created on first use with ``lang``
    and reused afterwards, so ``lang`` only has an effect while the global
    is still None.
    """
    global hyphenator
    if hyphenator is None:
        print('Initializing Hyphenator (' + lang + ')...')
        hyphenator = Hyphenator(lang)

    parts = hyphenator.syllables(word)

    # The hyphenator returns an empty list for words it cannot split
    # (single-syllable words); such a word is its own only syllable.
    return [word] if len(parts) == 0 else parts
Exemple #14
0
def hyphenate(value, arg=None, autoescape=None):
    """Insert soft hyphens (``&shy;``) into the long words of ``value``.

    ``arg`` is an optional "code[,minlen]" string: ``code`` is a language
    code such as "en" or "en-US" (default: ``settings.LANGUAGE_CODE``) and
    ``minlen`` the word length above which words are hyphenated (default 7).
    Only purely alphabetic words longer than ``minlen`` are touched.

    When ``autoescape`` is true the input is escaped before the result is
    marked safe. Returns the result wrapped in ``mark_safe``.
    """
    if autoescape:
        esc = conditional_escape
    else:
        esc = lambda x: x

    # Escape up front: the result is marked safe below, so unescaped input
    # would otherwise bypass autoescaping. Escaping the whole value (rather
    # than each word) leaves already-safe input untouched.
    value = esc(value)

    minlen = 7

    if arg:
        args = arg.split(u',')
        code = args[0]
        if len(args) > 1:
            minlen = int(args[1])
    else:
        code = settings.LANGUAGE_CODE

    #
    # The language code may arrive as 'xx-YY' or as a bare 'xx'. PyHyphen
    # needs a full locale, so a bare code MUST be expanded.
    #

    # TODO: This should probably be a lookup against a dict in settings?

    s = code.split(u'-')

    if len(s) == 1:
        if s[0] == 'en':
            s.append(u'US')
        elif s[0] == 'bg':
            s.append(u'BG')
        else:
            # No region is known for this bare language code, so no locale
            # can be built. Return the text without hyphenation instead of
            # failing with IndexError below.
            return mark_safe(value)

    lang = s[0].lower() + u'_' + s[1].upper()

    if not dictools.is_installed(lang):
        dictools.install(lang)

    h = Hyphenator(lang)

    new = []

    for word in value.split(u' '):
        if len(word) > minlen and word.isalpha():
            # syllables() returns an empty list for words it cannot split;
            # fall back to the word itself so that it is not dropped.
            new.append(u'&shy;'.join(h.syllables(word) or [word]))
        else:
            new.append(word)

    result = u' '.join(new)
    return mark_safe(result)
Exemple #15
0
def gunningfog_score(text=None, abbr=None, hyphen=None, vars=None):
    """Calculate Gunning Fog score.

    When ``text`` is given, the sentence, word and polysyllabic-word counts
    are computed from it and stored in ``vars`` (a dict passed by the caller
    is updated in place). Without ``text``, ``vars`` must already contain
    ``word_count``, ``sent_count`` and ``polysyblword_count``.

    Raises KeyError when a required count is missing and ZeroDivisionError
    when ``sent_count`` or ``word_count`` is 0.
    """
    # A fresh dict per call: the former ``vars={}`` default was shared between
    # calls, so counts from one text leaked into later calls.
    if vars is None:
        vars = {}
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['polysyblword_count'] = sybl_counts(text, abbr, hyphen,
                                                 True)['polysyblword_count']
    return 0.4 * ((vars['word_count'] / float(vars['sent_count'])) + 100 *
                  (vars['polysyblword_count'] / float(vars['word_count'])))
Exemple #16
0
def fleschkincaid_score(text=None, abbr=None, hyphen=None, vars=None):
    """Calculate Flesch-Kincaid score.

    When ``text`` is given, the sentence, word and syllable counts are
    computed from it and stored in ``vars`` (a dict passed by the caller is
    updated in place). Without ``text``, ``vars`` must already contain
    ``word_count``, ``sent_count`` and ``sybl_count``.

    Raises KeyError when a required count is missing and ZeroDivisionError
    when ``sent_count`` or ``word_count`` is 0.
    """
    # A fresh dict per call: the former ``vars={}`` default was shared between
    # calls, so counts from one text leaked into later calls.
    if vars is None:
        vars = {}
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['sybl_count'] = sybl_counts(text, abbr, hyphen,
                                         True)['sybl_count']
    # float() forces true division even where "/" truncates integers
    # (Python 2), matching the other score functions.
    return -15.59 + 0.39 * (vars['word_count'] / float(
        vars['sent_count'])) + 11.8 * (vars['sybl_count'] /
                                       float(vars['word_count']))
 def _set_lang_dict(self):
     """Install (if needed) and load the hyphenation dictionary.

     Does nothing unless ``self.dict_download`` is true. On success
     ``self.lang_dict`` is set to a Hyphenator for ``self.lang_code``;
     failures are tolerated and only reported when ``self.verbose`` is set.
     """
     if self.dict_download:
         try:
             if not is_installed(self.lang_code):
                 if self.verbose:
                     print(Msg.DICT_INSTALL(self.lang_code))
                 install(self.lang_code)
             self.lang_dict = Hyphenator(self.lang_code)
         except Exception:
             # Best effort: a failed download or load must not abort the
             # caller. ``Exception`` (not a bare ``except``) so that
             # KeyboardInterrupt and SystemExit still propagate.
             pass
         if self.verbose:
             # Report the outcome based on what is actually installed now.
             if is_installed(self.lang_code):
                 print(Msg.DICT_INSTALLED(self.lang_code))
             else:
                 print(Msg.DICT_INSTALL_FAILED(self.lang_code))
Exemple #18
0
def flesch_score(text=None, abbr=None, hyphen=None, vars=None):
    """Calculate Flesch Reading Ease score.

    When ``text`` is given, the sentence, word and syllable counts are
    computed from it and stored in ``vars`` (a dict passed by the caller is
    updated in place). Without ``text``, ``vars`` must already contain
    ``word_count``, ``sent_count`` and ``sybl_count``.

    Raises KeyError when a required count is missing and ZeroDivisionError
    when ``sent_count`` or ``word_count`` is 0.
    """
    # A fresh dict per call: the former ``vars={}`` default was shared between
    # calls, so counts from one text leaked into later calls.
    if vars is None:
        vars = {}
    if text:
        if not abbr:
            abbr = Abbreviations()
        if not hyphen:
            hyphen = Hyphenator('en_US')
        text = punct_clean(text, abbr)
        vars['sent_count'] = sent_count(text, abbr, True)
        text = word_array(text, abbr, True)
        vars['word_count'] = word_count(text, abbr, True)
        vars['sybl_count'] = sybl_counts(text, abbr, hyphen,
                                         True)['sybl_count']
    return 206.835 - 1.015 * (vars['word_count'] / float(
        vars['sent_count'])) - 84.6 * (vars['sybl_count'] /
                                       float(vars['word_count']))
Exemple #19
0
def main():
    """Wrap a text file (or standard input) to a maximum width with hyphenation.

    Standard input is wrapped one input line at a time; a file is read and
    wrapped as a whole. The wrapped lines are printed to standard output.
    """
    parser = argparse.ArgumentParser(
        description="Wrap text file to given width, with hyphenation"
    )
    parser.add_argument("-w", "--width", type=int, default=70, help="Maximum line width")
    parser.add_argument("-l", "--language", default="en_US", help="Text file locale")
    parser.add_argument("path", help="Text file path. Use '-' to read from standard input.")
    args = parser.parse_args()

    hyphenator = Hyphenator(args.language)

    def emit_wrapped(chunk):
        # Wrap one chunk of text and print the resulting lines.
        for wrapped in textwrap2.wrap(chunk, width=args.width, use_hyphenator=hyphenator):
            print(wrapped)

    if args.path != "-":
        with open(args.path) as f:
            emit_wrapped(f.read())
        return

    for content in sys.stdin:
        emit_wrapped(content)
Exemple #20
0
def syllablize(line):
    """
    Split a line of text into a flat list of syllables.

    Hyphens are treated as word separators, and commas, colons and
    semicolons are removed from each word before it is split.
    """
    hyphenator = Hyphenator('en_US')
    collected = []
    # A hyphenated word counts as two separate words.
    for raw_word in line.replace("-", " ").split():
        word = raw_word
        for mark in (",", ":", ";"):
            word = word.replace(mark, "")
        parts = hyphenator.syllables(word)
        if parts:
            collected.extend(parts)
        else:
            # pyhyphen returns an empty list for some words (instead of the
            # word itself); keep such a word as a single syllable.
            collected.append(word)
    return collected
 def word_phonic_dict_func(self):
     '''
     Fill self.arpabet_dict and self.word_syl_dict from self.lyrics_tokenized.

     For every word not yet in self.arpabet_dict:
         arpabet_dict[word]  - first pronunciation from pr.phones_for_word
         word_syl_dict[word] - the word's syllables, or [word] when the
                               hyphenator returns none
     Words that raise during lookup are reported and skipped. Returns None.
     '''
     hyphenator = Hyphenator('en_US')
     for line in self.lyrics_tokenized:
         for word in line:
             if word in self.arpabet_dict:
                 continue
             try:
                 # Indexing [0] raises when no pronunciation is known; the
                 # word is then skipped before any syllables are stored.
                 self.arpabet_dict.update(
                     {word: pr.phones_for_word(word)[0]})
                 syllables = hyphenator.syllables(unicode(word))
                 if len(syllables) == 0:
                     syllables = [unicode(word)]
                 self.word_syl_dict.update({word: syllables})
             except Exception as e:
                 print(e)
Exemple #22
0
    def sylTokenizer(text):
        """Tokenize ``text`` into n-grams of syllables.

        Words come from ``wordTokenizer``; each word is split into syllables
        with the syllabifier for the enclosing ``language`` ('en', 'te', or
        anything else for Hindi). Words of length 1, and words for which no
        syllables are returned, are kept whole. For every word, all windows
        of ``ngrams`` consecutive syllables are joined into tokens (a word
        with fewer syllables than ``ngrams`` gives one token).

        Returns a flat list of tokens; an empty list when there are no words.
        """
        words = wordTokenizer(text)

        # The three languages differ only in the syllabifying callable.
        if language == 'en':
            syllabify = Hyphenator('en_US').syllables
        elif language == 'te':
            syllabify = Syllabifier().syllabify_te
        else:
            syllabify = Syllabifier().syllabify_hi

        tokens = []
        for word in words:
            # Syllabify each word once; fall back to the whole word.
            syllables = syllabify(word) if len(word) > 1 else []
            if len(syllables) == 0:
                syllables = [word]
            window_count = max(len(syllables) - ngrams + 1, 1)
            tokens.extend("".join(syllables[i:i + ngrams])
                          for i in range(window_count))
        # Built with extend() rather than reduce(): linear instead of
        # quadratic, and empty input gives [] instead of a TypeError.
        return tokens
Exemple #23
0
    def __init__(self,
                 text,
                 abbr=Abbreviations(),
                 hyphen=Hyphenator('en_US'),
                 easy=EasyWords()):
        """Compute text statistics and readability scores for ``text``.

        Each count is stored as an attribute and collected in
        ``self.counts``; each score is stored as an attribute and collected
        in ``self.scores``.
        """
        # Counts taken from the cleaned text.
        cleaned = punct_clean(text, abbr)
        sentences = sent_count(cleaned, abbr, True)
        characters = char_count(cleaned, abbr, True)

        # Counts taken from the word array.
        tokens = word_array(cleaned, abbr, True)
        words = word_count(tokens, abbr, True)
        not_dalechall = notdalechall_count(tokens, abbr, easy, True)
        syllable_info = sybl_counts(tokens, abbr, hyphen, True)

        self.sent_count = sentences
        self.char_count = characters
        self.word_count = words
        self.notdalechall_count = not_dalechall
        self.sybl_count = syllable_info['sybl_count']
        self.polysyblword_count = syllable_info['polysyblword_count']

        self.counts = {
            'char_count': characters,
            'word_count': words,
            'sent_count': sentences,
            'sybl_count': self.sybl_count,
            'notdalechall_count': not_dalechall,
            'polysyblword_count': self.polysyblword_count
        }

        # Every score is derived from the precomputed counts, not the text.
        self.scores = {
            'flesch_score': flesch_score(vars=self.counts),
            'fleschkincaid_score': fleschkincaid_score(vars=self.counts),
            'gunningfog_score': gunningfog_score(vars=self.counts),
            'smog_score': smog_score(vars=self.counts),
            'dalechall_score': dalechall_score(vars=self.counts)
        }
        self.flesch_score = self.scores['flesch_score']
        self.fleschkincaid_score = self.scores['fleschkincaid_score']
        self.gunningfog_score = self.scores['gunningfog_score']
        self.smog_score = self.scores['smog_score']
        self.dalechall_score = self.scores['dalechall_score']
Exemple #24
0
 def __init__(self, text, abbr=Abbreviations(), hyphen=Hyphenator('en_US'), easy=EasyWords()):
     """Compute text statistics for ``text``.

     Each count is stored as an attribute and collected in ``self.counts``.
     """
     # Counts taken from the cleaned text.
     cleaned = punct_clean(text, abbr)
     sentences = sent_count(cleaned, abbr, True)
     characters = char_count(cleaned, abbr, True)

     # Counts taken from the word array.
     tokens = word_array(cleaned, abbr, True)
     words = word_count(tokens, abbr, True)
     not_dalechall = notdalechall_count(tokens, abbr, easy, True)
     syllable_info = sybl_counts(tokens, abbr, hyphen, True)

     self.sent_count = sentences
     self.char_count = characters
     self.word_count = words
     self.notdalechall_count = not_dalechall
     self.sybl_count = syllable_info['sybl_count']
     self.polysyblword_count = syllable_info['polysyblword_count']

     self.counts = {
         'char_count': characters,
         'word_count': words,
         'sent_count': sentences,
         'sybl_count': self.sybl_count,
         'notdalechall_count': not_dalechall,
         'polysyblword_count': self.polysyblword_count
     }
Exemple #25
0
def build_sentence_data(title, timestamps, sentence, sent_dict):
    '''
    Build sentence info from timestamps, sentence text and sentiment lexicon
    :param title: passed through to SentenceData
    :param timestamps: indexable sequence holding two entries per word; the
        entries at 2*i and 2*i+1 are converted to float and attached to
        word i (presumably start and end time -- confirm against the caller)
    :param sentence: raw sentence text
    :param sent_dict: mapping from word to sentiment value; words that are
        missing get sentiment 0
    :return: a SentenceData object contain text-based information about the sentence
    '''
    s = SentenceData(title, sentence)
    s.words = []

    h_en = Hyphenator('en_US')
    # Splitting on separators leaves empty strings at the start and/or end of
    # the list. Filter them all out: ``words.remove('')`` raised ValueError
    # when there was none (e.g. "hello world") and left a stray '' behind when
    # there were two (e.g. " hello world.").
    words = [w for w in re.split('[,.!?\r\n ]+', sentence) if w]
    words_with_punct = sentence.split()

    for ind, word in enumerate(words):
        if word in sent_dict:
            c_sentiment = sent_dict[word]
        else:
            c_sentiment = 0
        # If the whitespace-delimited token differs from the bare word, keep
        # its last character as the attached punctuation mark.
        punct = ''
        if words_with_punct[ind] != word:
            punct = words_with_punct[ind][-1]
        # text2num signals "not a number" with -1; store that as ''.
        num = t2n.text2num(word)
        if num == -1:
            num = ''
        else:
            num = str(num)
        w = WordData(word, float(timestamps[ind * 2]),
                     float(timestamps[ind * 2 + 1]), c_sentiment,
                     len(h_en.syllables(unicode(word))), punct, num)
        s.words.append(w)
    return s
Exemple #26
0
 def __init__(self, text='Defualt Tweet'):
     # only keep latin chars:
     self.rawText = re.sub(ur'[^\x00-\x7F]', u'', text)
     self.textWords = self.rawText.split()
     self.h_en = Hyphenator('en_US')
     self.badSymbols = ['http:', 'https:', '&']
     self.forbiddenThings = ['@']  # random syms
     self.forbiddenWords = [
         'el',
         'la',
         'en',
         'tu',  # spanish
         'Et',
         'le',
         'aux',
         'les',
         'de',
         'des',
         'du',
         'il',
         'Elle',
         'ses',
         'sa',
         'ces',
         'cela',
         'est',
         'vous',
         'tous',
         'nous',
         'allez',
         'alons'
     ]  # french
     self.forbiddenEnds = [
         'the', 'and', 'a', 'an', 'for', 'at', 'except', 'or', 'has', 'my',
         'your', 'their', 'his', 'hers', 'her\'s', 'get', 'it\'ll', 'to',
         'like', 'is', 'I'
     ]
Exemple #27
0
def split_lyrics_to_syllables(selected_song, user_lyrics):
    """
    Split the user's lyrics so that they fit the modifiable region of a song.

    The lyrics in a music score are split into syllables, each paired with
    one or more beats (in "Happy Birthday", "happy" becomes "hap" and "py").
    The replacement lyrics therefore have to be split into exactly as many
    pieces as the modifiable region of the score holds.

    Strategy:
    1. Split every word into syllables. If there are at most as many
       syllables as slots, use them, padded with "" at the end.
    2. Otherwise fall back to whole words, padded with "" if needed.
    3. If there are still too many words, repeatedly merge the first two
       words until the count fits.

    Arguments:
    selected_song - Mapping describing the song. The keys "startEditPos",
                    "endEditPos" (inclusive bounds of the modifiable region)
                    and "language" are read.
    user_lyrics   - The lyrics text that replaces the original lyrics in the
                    modifiable region. Empty lyrics produce a list of empty
                    strings.

    Exceptions raised:
    ValueError - Raised when the song language is neither "en_US" nor "es"
    RuntimeError - Raised when the split lyrics cannot fit into the
                   modifiable region

    Return:
    split_user_lyrics - A list of strings whose length equals the number of
                        slots in the modifiable region; each string replaces
                        one syllable there
    """
    start_edit_pos = selected_song["startEditPos"]
    end_edit_pos = selected_song["endEditPos"]
    song_language = selected_song["language"]

    # Number of syllable slots available in the modifiable region.
    xml_edit_num = end_edit_pos - start_edit_pos + 1

    if song_language == "en_US":
        h = Hyphenator('en_US')
    elif song_language == "es":
        h = Hyphenator('es')
    else:
        raise ValueError(
            "Song language not supported, currently only support English and Spanish."
        )

    user_lyrics_words = user_lyrics.split()

    user_lyrics_syllables = []
    for word in user_lyrics_words:
        # An empty result means a single-syllable word: keep it whole.
        user_lyrics_syllables.extend(h.syllables(word) or [word])

    # Lengths are compared directly instead of through ratios: this avoids a
    # ZeroDivisionError for empty lyrics and does not depend on "/" being
    # true division.
    if len(user_lyrics_syllables) <= xml_edit_num:
        # Syllables fit; pad the unused slots with empty strings.
        padding = xml_edit_num - len(user_lyrics_syllables)
        split_user_lyrics = user_lyrics_syllables + [""] * padding
    elif len(user_lyrics_words) <= xml_edit_num:
        # Too many syllables, but whole words fit; pad if necessary.
        padding = xml_edit_num - len(user_lyrics_words)
        split_user_lyrics = user_lyrics_words + [""] * padding
    else:
        # Too many words: merge the first two words until the count fits
        # (or only one word is left).
        while len(user_lyrics_words) > xml_edit_num and len(
                user_lyrics_words) > 1:
            user_lyrics_words[0:2] = [''.join(user_lyrics_words[0:2])]
        split_user_lyrics = user_lyrics_words

    if len(split_user_lyrics) != xml_edit_num:
        raise RuntimeError(
            'Fail to fit user lyrics into the song modifiable region')
    return split_user_lyrics
Exemple #28
0
from hyphen import Hyphenator
# Module-level en_US hyphenator shared by the functions below.
h_en = Hyphenator('en_US')

# Runs at import time: prints the syllable split of a sample word.
output = h_en.syllables('longer')
print(output)


def get_syllables(word):
    """
    Return the syllables of ``word`` from the module-level hyphenator.

    When the hyphenator returns an empty list, the word itself is returned
    as a single-element list.
    """
    parts = h_en.syllables(word)
    if parts != []:
        return parts
    return [word]

def get_coloured_para(para):
    """
    Return a list with ``color_word`` applied to every item of ``para``,
    in the original order.
    """
    return [color_word(item) for item in para]

Exemple #29
0
import math
from hyphen import Hyphenator

import project
from src.helper.files import read_lines

from acl_cleaned_get_ocr_errors import get_ocr_errors
from acl_cleaned_analyse_ocr_errors import get_ocr_character_edits


if __name__ == "__main__":
    # ``sys`` is not among this script's imports, so ``sys.argv`` below
    # raised NameError; import it here where it is used.
    import sys

    # Usage: <script> RAW_FILE CLEAN_FILE [OUT_FILE]
    raw_file = sys.argv[1]
    clean_file = sys.argv[2]
    out_file = sys.argv[3] if len(sys.argv) > 3 else None

    hyphenator = Hyphenator()

    error_frequencies = {}

    # Walk the corrupt and clean files in lockstep, one sequence per line.
    for i, (corrupt, correct) in enumerate(zip(read_lines(raw_file), read_lines(clean_file))):
        print(f"** SEQUENCE {i} **")
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        for corrupt, correct in ocr_errors:
            corrupt_parts = corrupt.split(" ")
            correct_parts = correct.split(" ")
            # Only errors with the same number of space-separated parts on
            # both sides can be compared part by part.
            if len(corrupt_parts) != len(correct_parts):
                continue
            for corrupt_part, correct_part in zip(corrupt_parts, correct_parts):
                edits = get_ocr_character_edits(correct_part, corrupt_part)
Exemple #30
0
def hyphenate(xhtml: str,
              language: Optional[str],
              ignore_h_tags: bool = False) -> str:
    """
    Add soft hyphens to a string of XHTML.

    INPUTS
    xhtml: A string of XHTML
    language: An ISO language code, like en-US, or None to auto-detect based on XHTML input
    ignore_h_tags: True to not hyphenate within <h1-6> tags

    OUTPUTS
    A string of XHTML with soft hyphens inserted in words. The output is not guaranteed to be pretty-printed.

    RAISES
    se.InvalidLanguageException: language is None and <html> has neither an `xml:lang` nor a `lang` attribute
    se.MissingDependencyException: no hyphenator could be created for the language
    """

    # Hyphenators keyed by language code; the dict is local to this call.
    hyphenators: Dict[str, Hyphenator] = {}
    soup = BeautifulSoup(xhtml, "lxml")

    if language is None:
        # Prefer `xml:lang`, then fall back to `lang`.
        try:
            language = str(soup.html["xml:lang"])
        except Exception:
            try:
                language = str(soup.html["lang"])
            except Exception:
                raise se.InvalidLanguageException(
                    "No `xml:lang` or `lang` attribute on `<html>` element; couldn’t guess file language."
                )

    try:
        # Hyphenator locales use an underscore (en_US) instead of a dash (en-US).
        language = language.replace("-", "_")
        if language not in hyphenators:
            hyphenators[language] = Hyphenator(language)
    except Exception:
        raise se.MissingDependencyException(
            f"Hyphenator for language `{language}` not available.\nInstalled hyphenators: {list_installed()}"
        )

    text = str(soup.body)
    result = text  # Copy of `text` that receives the hyphenated words
    word = ""  # Word currently being accumulated outside of tags
    in_tag = False
    tag_name = ""
    reading_tag_name = False
    in_h_tag = False
    pos = 1  # 1-based position in `result` matching the current char of `text`
    h_opening_tag_pattern = regex.compile("^h[1-6]$")
    h_closing_tag_pattern = regex.compile("^/h[1-6]$")

    # The general idea here is to read the whole contents of the <body> tag character by character.
    # If we hit a <, we ignore the contents until we hit the next >.
    # Otherwise, we consider a word to be an unbroken sequence of alphanumeric characters.
    # We can't just split at whitespace because HTML tags can contain whitespace (attributes for example)
    for char in text:
        process = False

        if char == "<":
            # A tag starts: whatever word was accumulated is complete.
            process = True
            in_tag = True
            reading_tag_name = True
            tag_name = ""
        elif in_tag and char == ">":
            in_tag = False
            reading_tag_name = False
            word = ""
        elif in_tag and char == " ":
            # The tag name ends at the first space; attributes follow.
            reading_tag_name = False
        elif in_tag and reading_tag_name:
            tag_name = tag_name + char
        elif not in_tag and char.isalnum():
            word = word + char
        elif not in_tag:
            # A non-alphanumeric character outside a tag ends the word.
            process = True

        # Do we ignore <h1-6> tags?
        if not reading_tag_name and h_opening_tag_pattern.match(tag_name):
            in_h_tag = True

        if not reading_tag_name and h_closing_tag_pattern.match(tag_name):
            in_h_tag = False

        if ignore_h_tags and in_h_tag:
            process = False

        if process:
            if word != "":
                new_word = word

                # 100 is the hard coded max word length in the hyphenator module
                # Check here to avoid an error
                if len(word) < 100:
                    syllables = hyphenators[language].syllables(word)

                    # An empty list leaves the word unchanged.
                    if syllables:
                        new_word = "\u00AD".join(syllables)

                # Replace the word that ends just before `char` in `result`,
                # then shift `pos` by the number of characters added.
                result = result[:pos - len(word) -
                                1] + new_word + char + result[pos:]
                pos = pos + len(new_word) - len(word)
            word = ""

        pos = pos + 1

    # Replace the original <body> with the hyphenated one by removing it and
    # re-inserting the result after </head>.
    xhtml = regex.sub(r"<body.+<\/body>", "", xhtml, flags=regex.DOTALL)
    xhtml = xhtml.replace("</head>", "</head>\n\t" + result)

    return xhtml