예제 #1
0
def align(argv):
    """Fetch, convert and align the audio and text of each requested chapter.

    For every chapter of every selected book this function:

    1. downloads the chapter MP3 (skipped when the file already exists),
    2. converts it to a 16 kHz WAV with SoX (skipped when already present),
    3. downloads the chapter text and strips markup from it,
    4. runs the long-audio-aligner JAR on the text/WAV pair, and
    5. writes the word timings to ``<osis>.<chapter>.timings.json`` in the
       data directory as ``{"words": [[word, start, end], ...]}``.

    Args:
        argv: Sequence of command-line style arguments. Entries starting with
            ``-`` are options; ``-f``/``--force`` re-runs the alignment even
            when a timings file already exists. The remaining entries are
            handed to ``bookinfo.get_book_subset``; when there are none the
            selection defaults to Matt, Mark, Luke, John, Acts and Rom.

    Raises:
        Exception: If a download does not answer with HTTP 200, if svn, SoX
            or the Java aligner exits with a non-zero status, or if a token
            of the aligner output cannot be parsed.

    Note:
        ``print`` is called with a single parenthesised argument, which
        produces the same output as the Python 2 print statement.
    """
    init_start_time = clock()

    __dir__ = path.realpath(path.dirname(__file__))
    sphinx_long_audio_aligner_repo_url = "http://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/branches/long-audio-aligner/Aligner"
    long_audio_aligner_path = __dir__ + '/long-audio-aligner'
    data_path = __dir__ + '/data/net'
    batch_file = long_audio_aligner_path + '/resource/batchFile.txt'

    # Honour the flag in the arguments we were given as well as in the
    # process arguments (the original only looked at sys.argv).
    is_force = any(flag in argv or flag in sys.argv for flag in ('--force', '-f'))

    # Positional (non-option) arguments select the books.
    book_args = [arg for arg in argv if arg and not arg.startswith('-')]
    if not book_args:
        # The original compared the filtered list against '' so this default
        # never applied. NOTE(review): passed as a list of names, matching how
        # get_book_subset is fed sys.argv[1:] elsewhere -- confirm.
        book_args = ['Matt', 'Mark', 'Luke', 'John', 'Acts', 'Rom']

    books = bookinfo.get_book_subset(book_args)

    # svn co http://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/branches/long-audio-aligner/Aligner aligner
    if not path.exists(long_audio_aligner_path):
        print("Fetching long-audio-aligner from Sourceforge...")
        retcode = subprocess.call(['svn', 'co', sphinx_long_audio_aligner_repo_url, long_audio_aligner_path])
        if retcode != 0:
            raise Exception("fail (could not check out long-audio-aligner; have you svn?)")

    # Empty the initial batchFile since we don't want to process it anyway
    with open(batch_file, 'w') as f:
        f.write('')

    # NOTE: the ant build step is disabled in this variant; only the message
    # is printed and nothing is compiled here.
    print("Running ant")

    # Create the data directory which is where we put all the data: audio, text, alignments
    if not path.exists(data_path):
        print("Making data dir")
        # data_path is nested ('data/net'), so intermediate dirs may be missing
        os.makedirs(data_path)

    def save_url(url, file, encoding=None, strip_markup=False):
        """Fetch ``url`` and save the response body to ``file``.

        The output file is only created after the download succeeded, so a
        failed request never leaves an empty file behind that a later run
        would mistake for an already-fetched one.

        Args:
            url: Address to download.
            file: Path of the file to write.
            encoding: When given, the body is decoded with this encoding and
                written back through a codec writer using the same encoding.
                When omitted the raw bytes are written unchanged.
            strip_markup: When true, remove the ``&#8211;`` entity, curly
                quotes and ``<tag>text</tag>`` runs before saving. Only
                meaningful for text responses; binary downloads must leave
                this off so their bytes are not altered.

        Raises:
            Exception: If the HTTP status code is not 200.
        """
        response = urllib.urlopen(url)
        try:
            status = response.getcode()
            if status != 200:
                raise Exception("Unable to fetch %s. Status code: %s" % (url, str(status)))
            body = response.read()
        finally:
            response.close()

        if strip_markup:
            for unwanted in ("&#8211;", "“", "”", "’"):
                body = body.replace(unwanted, "")
            body = re.sub('<[^<]+?>[^<]+?</[^<]+?>', '', body)

        if encoding:
            with codecs.open(file, mode='w', encoding=encoding) as fo:
                fo.write(body.decode(encoding))
        else:
            # Binary mode keeps MP3 data intact on every platform
            with open(file, 'wb') as fo:
                fo.write(body)

    # One aligner output token looks like word(start,end)
    timing_pattern = re.compile(r'(.+)\((.+),(.+)\)')

    for book in books:
        print("########################")
        print("%s (%s)" % (book.name, book.osis))
        print("########################")

        book_start_time = clock()

        for chapter in book.chapters:
            chapter_start_time = clock()

            mp3_file = data_path + '/%s.%d.mp3' % (book.osis, chapter)
            print("%s %d" % (book.osis, chapter))

            # Fetch MP3
            if not os.path.exists(mp3_file):
                mp3_url = 'https://net.bible.org/audio/get/{index}-{book}-{chapter}.mp3'.format(
                    index="{0:02d}".format(book.index),
                    book=book.name,
                    chapter="{0:02d}".format(chapter)
                )
                print("Downloading MP3")
                save_url(mp3_url, mp3_file)
            else:
                print("Skipping MP3 (already-fetched)")

            # Convert to WAV
            wav_file = mp3_file.replace('.mp3', '.wav')
            if not os.path.exists(wav_file):
                print("Generating WAV file from MP3")
                retcode = subprocess.call(['sox', mp3_file, wav_file, 'rate', '16k'])
                if retcode != 0:
                    raise Exception("fail (have you installed SoX?)")
            else:
                print("Skipping WAV (already-generated)")

            # Fetch the chapter text without verse numbers
            verseless_text_file = data_path + '/%s.%d.verseless.txt' % (book.osis, chapter)
            if not path.exists(verseless_text_file):
                print("Fetching verseless text")
                # @todo What is the character encoding of the response??
                params = {
                    'passage': '{book} {chapter}'.format(book=book.name, chapter=chapter),
                    'formatting': 'text',
                    'type': 'text',
                }
                text_url = 'http://labs.bible.org/api/?%s' % urllib.urlencode(params)
                save_url(text_url, verseless_text_file, strip_markup=True)
            else:
                print("Skipping verseless text (already-fetched)")

            # Create batch file for this chapter (paths are relative to the aligner dir)
            with open(batch_file, 'w') as f:
                f.write('../data/net/{book}.{chapter}.verseless.txt ../data/net/{book}.{chapter}.wav'.format(book=book.osis, chapter=chapter))

            # Now run the aligner on the batchFile
            timings_file = data_path + '/%s.%d.timings.json' % (book.osis, chapter)
            if is_force or not path.exists(timings_file):
                print("Aligning text")

                # Run inside the aligner directory via cwd= instead of
                # os.chdir, so a failure cannot leave this process stranded
                # in another working directory.
                retcode = subprocess.call(
                    ['java', '-Xmx3g', '-jar', 'bin/aligner.jar'],
                    cwd=long_audio_aligner_path
                )
                if retcode != 0:
                    raise Exception("fail (haz Java?)")

                # Obtain the timed output
                with codecs.open(long_audio_aligner_path + '/timedOutput/1.txt', encoding='utf-8') as fi:
                    raw_timings = fi.read().split()

                # Parse each token into [word, start, end]
                word_timings = []
                for raw_timing in raw_timings:
                    matches = timing_pattern.match(raw_timing)
                    if matches is None:
                        raise Exception("Unable to parse aligner output: %r" % raw_timing)
                    word_timings.append([
                        matches.group(1),
                        float(matches.group(2)),
                        float(matches.group(3)),
                    ])

                with codecs.open(timings_file, mode='w', encoding='utf-8') as fo:
                    fo.write(json.dumps({'words': word_timings}, indent=1))
            else:
                print("Text already aligned")

            print("Time: %.02fs" % (clock() - chapter_start_time))
            print("--")

        print("%s book execution time: %.02fs" % (book.name, clock() - book_start_time))

    print("Total execution time: %.02fs" % (clock() - init_start_time))
예제 #2
0
파일: align.py 프로젝트: JoeyLeeBh/aligner
def align(argv):
    """Fetch, convert and align the ESV audio and text of each requested chapter.

    For every chapter of every selected book this function:

    1. downloads the chapter MP3 (skipped when the file already exists),
    2. converts it to a 16 kHz WAV with SoX (skipped when already present),
    3. downloads the chapter as verseless text, versed text and HTML,
    4. runs the long-audio-aligner JAR on the verseless text/WAV pair, and
    5. pairs the aligner's normalized words with the words of the versed
       text and writes ``{"verses": ..., "words": ...}`` to
       ``<osis>.<chapter>.timings.json`` in the data directory.

    Args:
        argv: Sequence of command-line style arguments. Entries starting with
            ``-`` are options; ``-f``/``--force`` re-runs the alignment even
            when a timings file already exists. The remaining entries are
            handed to ``bookinfo.get_book_subset``.

    Raises:
        Exception: If a download does not answer with HTTP 200, if svn, ant,
            SoX or the Java aligner exits with a non-zero status, if a token
            of the aligner output cannot be parsed, or if an aligned word
            cannot be found in the chapter text (more than five text words
            skipped, or the text runs out).

    Note:
        ``print`` is called with a single parenthesised argument, which
        produces the same output as the Python 2 print statement.
    """
    init_start_time = clock()

    __dir__ = path.realpath(path.dirname(__file__))
    sphinx_long_audio_aligner_repo_url = "http://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/branches/long-audio-aligner/Aligner"
    long_audio_aligner_path = __dir__ + '/long-audio-aligner'
    data_path = __dir__ + '/data'
    batch_file = long_audio_aligner_path + '/resource/batchFile.txt'
    esv_api_url = 'http://www.esvapi.org/v2/rest/passageQuery?%s'

    # Honour the flag in the arguments we were given as well as in the
    # process arguments (the original only looked at sys.argv).
    is_force = any(flag in argv or flag in sys.argv for flag in ('--force', '-f'))

    # Positional (non-option) arguments select the books.
    book_args = [arg for arg in argv if arg and not arg.startswith('-')]

    books = bookinfo.get_book_subset(book_args)

    # svn co http://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/branches/long-audio-aligner/Aligner aligner
    if not path.exists(long_audio_aligner_path):
        print("Fetching long-audio-aligner from Sourceforge...")
        retcode = subprocess.call(['svn', 'co', sphinx_long_audio_aligner_repo_url, long_audio_aligner_path])
        if retcode != 0:
            raise Exception("fail (could not check out long-audio-aligner; have you svn?)")

    # Empty the initial batchFile since we don't want to process it anyway
    with open(batch_file, 'w') as f:
        f.write('')

    # Build the Java project. cwd= runs ant inside the aligner directory
    # without changing this process's own working directory.
    print("Running ant")
    retcode = subprocess.call(['ant'], cwd=long_audio_aligner_path)
    if retcode != 0:
        raise Exception("fail (have you ant?)")

    # Create the data directory which is where we put all the ESV data: audio, text, HTML, alignments
    if not path.exists(data_path):
        print("Making data dir")
        os.makedirs(data_path)

    def save_url(url, file, encoding=None):
        """Fetch ``url`` and save the response body to ``file``.

        The output file is only created after the download succeeded, so a
        failed request never leaves an empty file behind that a later run
        would mistake for an already-fetched one.

        Args:
            url: Address to download.
            file: Path of the file to write.
            encoding: When given, the body is decoded with this encoding and
                written back through a codec writer using the same encoding
                (writing the raw bytes to the codec writer would make
                Python 2 decode them as ASCII first and fail on non-ASCII
                text). When omitted the raw bytes are written unchanged.

        Raises:
            Exception: If the HTTP status code is not 200.
        """
        response = urllib.urlopen(url)
        try:
            status = response.getcode()
            if status != 200:
                raise Exception("Unable to fetch %s. Status code: %s" % (url, str(status)))
            body = response.read()
        finally:
            response.close()

        if encoding:
            with codecs.open(file, mode='w', encoding=encoding) as fo:
                fo.write(body.decode(encoding))
        else:
            # Binary mode keeps MP3 data intact on every platform
            with open(file, 'wb') as fo:
                fo.write(body)

    def normalize_word_chunk(s):
        """Reduce a text word to the lower-case, punctuation-free form Sphinx reports."""
        return re.sub(r'\W', '', s).lower()

    def stip_punc(s):
        """Strip leading and trailing non-word characters from a text word."""
        return re.sub(r'^\W+|\W+$', '', s)

    # One aligner output token looks like word(start,end)
    timing_pattern = re.compile(r'(.+)\((.+),(.+)\)')

    for book in books:
        print("########################")
        print("%s (%s)" % (book.name, book.osis))
        print("########################")

        book_start_time = clock()

        for chapter in book.chapters:
            chapter_start_time = clock()
            passage = '{book} {chapter}'.format(book=book.name, chapter=chapter)

            mp3_file = data_path + '/%s.%d.mp3' % (book.osis, chapter)
            print("%s %d" % (book.osis, chapter))

            # Fetch MP3
            if not os.path.exists(mp3_file):
                # urlencode escapes the book name like the other requests do;
                # a list of pairs keeps the parameter order stable.
                mp3_url = esv_api_url % urllib.urlencode([
                    ('key', 'IP'),
                    ('output-format', 'mp3'),
                    ('passage', passage),
                ])
                print("Downloading MP3")
                save_url(mp3_url, mp3_file)
            else:
                print("Skipping MP3 (already-fetched)")

            # Convert to WAV
            wav_file = mp3_file.replace('.mp3', '.wav')
            if not os.path.exists(wav_file):
                print("Generating WAV file from MP3")
                retcode = subprocess.call(['sox', mp3_file, wav_file, 'rate', '16k'])
                if retcode != 0:
                    raise Exception("fail (have you installed SoX?)")
            else:
                print("Skipping WAV (already-generated)")

            # Fetch text for chapter for Aligner, first verseless then versed
            text_params = {
                'key': 'IP',
                'output-format': 'plain-text',
                'passage': passage,
                'include-passage-references': 'false',
                'include-first-verse-numbers': 'false',
                'include-footnotes': 'false',
                'include-short-copyright': 'false',
                'include-passage-horizontal-lines': 'false',
                'include-heading-horizontal-lines': 'false',
                'include-headings': 'false',
                'include-subheadings': 'false',
                'include-selahs': 'true',
                'line-length': '0',
            }

            text_params['include-verse-numbers'] = 'false'
            verseless_text_file = data_path + '/%s.%d.verseless.txt' % (book.osis, chapter)
            if not path.exists(verseless_text_file):
                print("Fetching verseless text")
                # @todo What is the character encoding of the response??
                save_url(esv_api_url % urllib.urlencode(text_params), verseless_text_file, 'utf-8')
            else:
                print("Skipping verseless text (already-fetched)")

            text_params['include-verse-numbers'] = 'true'
            versed_text_file = data_path + '/%s.%d.versed.txt' % (book.osis, chapter)
            if not path.exists(versed_text_file):
                print("Fetching versed text")
                # @todo What is the character encoding of the response??
                save_url(esv_api_url % urllib.urlencode(text_params), versed_text_file, 'utf-8')
            else:
                print("Skipping versed text (already-fetched)")

            # Fetch HTML for chapter
            html_file = data_path + '/%s.%d.html' % (book.osis, chapter)
            if not path.exists(html_file):
                print("Fetching HTML")
                params = {
                    'key': 'IP',
                    'passage': passage,
                    'include-passage-references': 'false',
                    'include-first-verse-numbers': 'false',
                    'include-verse-numbers': 'true',
                    'include-footnotes': 'true',
                    'include-surrounding-chapters': 'false',
                    'include-audio-link': 'false',
                    'include-short-copyright': 'false',
                    'include-copyright': 'true',
                }
                save_url(esv_api_url % urllib.urlencode(params), html_file, 'utf-8')
            else:
                print("Skipping HTML (already-fetched)")

            # Create batch file for this chapter (paths are relative to the aligner dir)
            with open(batch_file, 'w') as f:
                f.write('../data/{book}.{chapter}.verseless.txt ../data/{book}.{chapter}.wav'.format(book=book.osis, chapter=chapter))

            # Now run the aligner on the batchFile
            timings_file = data_path + '/%s.%d.timings.json' % (book.osis, chapter)
            if is_force or not path.exists(timings_file):
                print("Aligning text")

                retcode = subprocess.call(
                    ['java', '-Xmx3g', '-jar', 'bin/aligner.jar'],
                    cwd=long_audio_aligner_path
                )
                if retcode != 0:
                    raise Exception("fail (haz Java?)")

                # Read the versed chapter text
                with codecs.open(versed_text_file, mode='r', encoding='utf-8') as fi:
                    chapter_text = fi.read()

                # Split the text into words, with each [N] verse marker as its
                # own chunk. The first verse number is not part of the fetched
                # text ('include-first-verse-numbers' is 'false'), so add it.
                chapter_text = re.sub(r'(\[\d+\])', r' \1 ', chapter_text)
                unnormalized_word_chunks = chapter_text.strip().split()
                unnormalized_word_chunks.insert(0, '[1]')
                # Consume the chunks through an iterator rather than pop(0)
                remaining_chunks = iter(unnormalized_word_chunks)

                # Obtain the timed output
                with codecs.open(long_audio_aligner_path + '/timedOutput/1.txt', encoding='utf-8') as fi:
                    raw_timings = fi.read().split()

                verse_timings = OrderedDict()
                word_timings = []

                # Parse the timings out of the raw timings, and then pair up the
                # normalized word from Sphinx with the actual word from the text
                current_verse = None
                for raw_timing in raw_timings:
                    matches = timing_pattern.match(raw_timing)
                    if matches is None:
                        raise Exception("Unable to parse aligner output: %r" % raw_timing)

                    word = matches.group(1)
                    start = float(matches.group(2))
                    end = float(matches.group(3))

                    if word == '<unk>':
                        word = None
                    else:
                        # Must be a list: it is appended to and joined below
                        skipped_words = []
                        while True:
                            unnormalized_word_chunk = next(remaining_chunks, None)
                            if unnormalized_word_chunk is None:
                                raise Exception("Ran out of chapter text while looking for word: %r" % word)

                            # Detect the verses; loop again so consecutive
                            # markers are handled as well
                            if unnormalized_word_chunk.startswith('[') and unnormalized_word_chunk.endswith(']'):
                                current_verse = unnormalized_word_chunk.strip('[]')
                                verse_timings[current_verse] = {'start': None, 'end': None}
                                continue

                            if word == normalize_word_chunk(unnormalized_word_chunk):
                                word = stip_punc(unnormalized_word_chunk)
                                break
                            skipped_words.append(unnormalized_word_chunk)
                            if len(skipped_words) > 5:
                                raise Exception("Skipping several words: " + ", ".join(skipped_words))

                    # Keep track of verse timings. No verse is open yet when
                    # the output starts with '<unk>' tokens.
                    if current_verse is not None:
                        verse_timing = verse_timings[current_verse]
                        if verse_timing['start'] is None:
                            verse_timing['start'] = start
                        verse_timing['end'] = end

                    # Record word timings
                    word_timings.append({
                        'word'  : word,
                        'start' : start,
                        'end'   : end,
                    })

                with codecs.open(timings_file, mode='w', encoding='utf-8') as fo:
                    fo.write(json.dumps({'verses': verse_timings, 'words': word_timings}, indent=2))
            else:
                print("Text already aligned")

            print("Time: %.02fs" % (clock() - chapter_start_time))
            print("--")

        print("%s book execution time: %.02fs" % (book.name, clock() - book_start_time))

    print("Total execution time: %.02fs" % (clock() - init_start_time))
예제 #3
0
#!/usr/bin/env python
"""
Report generator to help provide feedback to the CMU Sphinx project.
Usage: generate-reports.py [osisBook chapter[...]]...
"""

import json
import codecs
import sys
import re
import math
import bookinfo
import os

# Get args
books = bookinfo.get_book_subset(sys.argv[1:])

for bookinfo in books:
    for chapter in bookinfo.chapters:
        # Get timings
        timings_file = "data/%s.%d.timings.json" % (bookinfo.osis, chapter)
        if not os.path.exists(timings_file):
            continue
        with codecs.open(timings_file, encoding='utf-8') as f:
            timings = json.loads(f.read()).get('words')

        # Get the original
        text_file = "data/%s.%d.txt" % (bookinfo.osis, chapter)
        if not os.path.exists(text_file):
            continue
        with codecs.open(text_file, encoding='utf-8') as f:
#!/usr/bin/env python
"""
Report generator to help provide feedback to the CMU Sphinx project.
Usage: generate-reports.py [osisBook chapter[...]]...
"""

import json
import codecs
import sys
import re
import math
import bookinfo
import os

# Get args
books = bookinfo.get_book_subset(sys.argv[1:])

for bookinfo in books:
    for chapter in bookinfo.chapters:
        # Get timings
        timings_file = "data/%s.%d.timings.json" % (bookinfo.osis, chapter)
        if not os.path.exists(timings_file):
            continue
        with codecs.open(timings_file, encoding="utf-8") as f:
            timings = json.loads(f.read()).get("words")

        # Get the original
        text_file = "data/%s.%d.txt" % (bookinfo.osis, chapter)
        if not os.path.exists(text_file):
            continue
        with codecs.open(text_file, encoding="utf-8") as f: