Beispiel #1
0
def run_gentle(key='103/1241/103_1241_000000_000001'):
    """Force-align one utterance with Gentle, caching the result as JSON.

    If '{key}.json' already exists under TRAIN_PATH it is loaded and
    returned directly; otherwise the '.normalized.txt' transcript is
    aligned against the '.wav' audio and the result is cached first.

    Parameters
    ----------
    key : str
        Utterance path (relative to TRAIN_PATH, no extension).

    Returns
    -------
    dict
        Gentle alignment result.
    """
    text_file = f'{TRAIN_PATH}/{key}.normalized.txt'
    audio_file = f'{TRAIN_PATH}/{key}.wav'
    json_file = f'{TRAIN_PATH}/{key}.json'

    # Serve a cached alignment from a previous run, if present.
    if os.path.isfile(json_file):
        with open(json_file) as r:
            return json.loads(r.read())

    with open(text_file, encoding="utf-8") as fh:
        transcript = fh.read()

    logging.info("converting audio to 8K sampled wav")

    with gentle.resampled(audio_file) as wavfile:
        # No progress callback is used (the original defined a no-op one
        # but passed progress_cb=None anyway).
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=multiprocessing.cpu_count(),
                                       disfluency=False,
                                       conservative=False,
                                       disfluencies=set(['uh', 'um']))
        result = aligner.transcribe(wavfile, progress_cb=None, logging=logging)
        result_dict = result.to_dict()
        with open(json_file, 'w') as f:
            json.dump(result_dict, f, indent=2)
        return result_dict
def align_words(audio, text):
    """Force-align `text` against 16 kHz `audio` and return word timings.

    Parameters
    ----------
    audio : np.ndarray
        Mono audio samples at 16 kHz.
    text : str
        Transcript to align.

    Returns
    -------
    list[list]
        [word, start_sec, end_sec] triples. Interior words Gentle failed
        on borrow their neighbours' boundary times; failed words at
        either edge are dropped.
    """
    # Resample to the 8 kHz rate used for alignment. Keyword arguments
    # are required by librosa >= 0.10 and accepted by earlier releases,
    # so this is forward- and backward-compatible.
    audio_8k = librosa.resample(audio, orig_sr=16000, target_sr=8000)
    wave_file = 'output/temp.wav'
    sf.write(wave_file, audio_8k, 8000, 'PCM_16')

    # run gentle to align words
    aligner = gentle.ForcedAligner(gentle_resources,
                                   text,
                                   nthreads=2,
                                   disfluency=False,
                                   conservative=False)
    gentle_out = aligner.transcribe(wave_file, logging=logging)
    words_with_timestamps = []
    for i, gentle_word in enumerate(gentle_out.words):
        if gentle_word.case == 'success':
            words_with_timestamps.append(
                [gentle_word.word, gentle_word.start, gentle_word.end])
        elif 0 < i < len(gentle_out.words) - 1:
            # NOTE(review): assumes both neighbours carry end/start times;
            # a failed neighbour would contribute None here — confirm.
            words_with_timestamps.append([
                gentle_word.word, gentle_out.words[i - 1].end,
                gentle_out.words[i + 1].start
            ])

    return words_with_timestamps
Beispiel #3
0
    def run_gentle(audio_path: str,
                   text_content: str,
                   tokenization_view: View = None):
        """Force-align `text_content` against the audio at `audio_path`.

        If `tokenization_view` is given, Gentle's internal token sequence
        is replaced by the view's TOKEN annotations so alignment follows
        the caller's token boundaries instead of Gentle's own tokenizer.

        Returns the Gentle alignment result object.
        """

        with gentle.resampled(audio_path) as audio_file:
            resources = gentle.Resources()
            aligner = gentle.ForcedAligner(
                resources,
                text_content,
                nthreads=multiprocessing.cpu_count(),
                disfluencies={'uh', 'um'},
                disfluency=True,
                conservative=False)
            if tokenization_view is not None:
                # Overwrite Gentle's tokenization via a private attribute.
                # NOTE(review): relies on gentle's internal MetaSentence
                # layout (`ms._seq`) — fragile across gentle versions.
                aligner.ms._seq = []
                for token in tokenization_view.get_annotations(Uri.TOKEN):
                    print(token.serialize(pretty=True))  # debug output
                    # Character offsets of the token within the transcript.
                    start = token.properties['start']
                    end = token.properties['end']
                    token_text = text_content[start:end]
                    kaldi_token = {
                        'start':
                        start,
                        'end':
                        end,
                        # Normalize against the aligner's vocabulary, the
                        # same form Gentle's own tokenizer would produce.
                        'token':
                        metasentence.kaldi_normalize(token_text,
                                                     aligner.ms.vocab)
                    }
                    aligner.ms._seq.append(kaldi_token)
            result = aligner.transcribe(audio_file)
            return result
Beispiel #4
0
def run_gentle(seg, transcript):
    """
    Takes in a segment
    1. create new text file containing text
    2. create new audio with pydub
    3. run Gentle with these two
    4. delete text file/audio files

    Parameters
    ---------
    seg : Segment object to align with Gentle
    transcript : string holding the relevant transcript for this segment
    """
    # seg.start_audio / seg.end_audio are in seconds; pydub slices in ms.
    audio_cut = seg.audio_file[1000 * seg.start_audio:1000 * seg.end_audio]

    audio_cut.export("temp_audio.wav", format="wav")

    # run Gentle; try/finally guarantees the temp file is removed even if
    # resampling or alignment raises (the original leaked it on error).
    resources = gentle.Resources()
    try:
        with gentle.resampled("temp_audio.wav") as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile).words
    finally:
        # delete cut audio file
        os.remove("temp_audio.wav")

    # fix unaligned-word start/end time data
    fix_unaligned(result, len(audio_cut) / 1000)

    # put gentle timestamps in relation to entire file
    for word in result:
        word.start += seg.start_audio
        word.end += seg.start_audio

    return result
Beispiel #5
0
    def get_gentle_response(self, parsed_txt_path):
        """Run Gentle forced alignment of a transcript against self.audiopath.

        Args:
            parsed_txt_path (str): path to the parsed transcript text file.

        Returns:
            list: one dict per aligned word, with the duration field omitted.
        """
        with open(parsed_txt_path, encoding="utf-8") as transcript_file:
            transcript_text = transcript_file.read()

        gentle_resources = gentle.Resources()
        # words for gentle to ignore when aligning
        ignore_words = {'uh', 'um'}
        with gentle.resampled(self.audiopath) as resampled_wav:
            forced_aligner = gentle.ForcedAligner(
                gentle_resources,
                transcript_text,
                nthreads=multiprocessing.cpu_count(),
                disfluency=False,
                conservative=False,
                disfluencies=ignore_words)
            alignment = forced_aligner.transcribe(resampled_wav)

        return [w.as_dict(without="duration") for w in alignment.words]
Beispiel #6
0
def align_file(transcription, snd_filename):
    """Force-align `transcription` to `snd_filename` with a 10 s time limit.

    Returns {snd_filename: alignment_result_or_None}; None on timeout.
    Uses SIGALRM, so this works only on Unix and in the main thread.
    """
    # TODO: Add a file of "sound files that did not finish"
    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")

        signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            signal.alarm(0)

    with gentle.resampled(snd_filename) as wavfile:
        aligner = gentle.ForcedAligner(resources,
                                       transcription,
                                       nthreads=nthreads)
        try:
            with time_limit(10):
                result = {snd_filename: aligner.transcribe(wavfile)}
        except TimeoutException:
            # BUG FIX: the original formatted an undefined name `fid` here,
            # raising NameError instead of reporting the timeout.
            print(
                "Transcription of {} timed out! Please check that your transcription is accurate."
                .format(snd_filename))
            result = {snd_filename: None}
    return result
Beispiel #7
0
def align(audiopath, text, nthreads=1):
    """Resample `audiopath` to 8 kHz and force-align `text` against it."""
    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Starting alignment")
        forced_aligner = gentle.ForcedAligner(
            resources,
            text,
            nthreads=nthreads,
            disfluency=False,
            conservative=False,
        )
        alignment = forced_aligner.transcribe(
            wavfile, progress_cb=on_progress, logging=logging)
    return alignment
Beispiel #8
0
def align(audiopath, transcript):
    """Force-align `transcript` to `audiopath`; return the result as a dict."""
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("starting alignment")
        forced_aligner = gentle.ForcedAligner(resources,
                                              transcript,
                                              nthreads=nthreads,
                                              disfluency=False,
                                              conservative=False)
        alignment = forced_aligner.transcribe(wavfile,
                                              progress_cb=on_progress,
                                              logging=logging)
        return json.loads(alignment.to_json())
def align_audio(wav_path, transcript):
    """Force-align `transcript` to the audio at `wav_path`; return a dict."""
    with gentle.resampled(wav_path) as wavfile:
        print("starting alignment {}".format(wav_path))
        forced_aligner = gentle.ForcedAligner(RESOURCES,
                                              transcript,
                                              nthreads=N_THREADS,
                                              disfluency=False,
                                              conservative=False,
                                              disfluencies=DISFLUENCIES)
        alignment = forced_aligner.transcribe(wavfile,
                                              progress_cb=_on_progress,
                                              logging=logging)
        parsed = json.loads(alignment.to_json())

    return parsed
Beispiel #10
0
def run_gentle(video_path, vid, result_path):
    """Align a video's subtitle text with its audio track and save the JSON.

    Parameters
    ----------
    video_path : str   path to the media file to align against
    vid : str          video id used to look up its subtitle file
    result_path : str  file the Gentle JSON output is written to
    """
    # Build one transcript string from all subtitle cues (each cue is
    # followed by a space, matching the original concatenation).
    vtt_subtitle = read_subtitle(vid)
    transcript = ''.join(sub.text + ' ' for sub in vtt_subtitle)
    # A literal replace suffices here; no regex needed.
    transcript = transcript.replace('\n', ' ')  # remove newline characters

    # align
    with gentle.resampled(video_path) as wav_file:
        aligner = gentle.ForcedAligner(resources, transcript, nthreads=nthreads, disfluency=False, conservative=False,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wav_file, logging=logging)

    # write results
    with open(result_path, 'w', encoding="utf-8") as fh:
        fh.write(result.to_json(indent=2))
Beispiel #11
0
def align(args):
    """Worker entry point: align one (audiopath, transcript) pair.

    `args` is a 2-tuple so the function can be mapped over a worker pool.
    Returns the alignment as a dict with an added 'audiopath' key.
    """
    audiopath, transcript = args
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Audio file: {}".format(audiopath))
        logging.info("Transcript: <{}...>".format(transcript[:40]))
        forced_aligner = gentle.ForcedAligner(resources,
                                              transcript,
                                              nthreads=1,
                                              disfluency=False,
                                              conservative=False)
        alignment = forced_aligner.transcribe(wavfile,
                                              progress_cb=on_progress,
                                              logging=logging)
        payload = json.loads(alignment.to_json())
        payload['audiopath'] = audiopath
        return payload
Beispiel #12
0
def align_db(data):
    """Align every row of `data` with Gentle, retrying failures each pass.

    Each row supplies `sentence_path` (audio) and `transcription`; the
    JSON alignment is written to 'alignments/<last 4 path components>.json'.
    Rows whose alignment raises are retried on subsequent passes until
    every row succeeds.

    Parameters
    ----------
    data : pandas.DataFrame with `sentence_path` and `transcription` columns.
    """
    import pathlib

    # Row indices still waiting for a successful alignment.
    pending = list(range(len(data)))

    while pending:
        failed = []
        for i in tqdm(pending):
            row = data.iloc[i]
            f = row.sentence_path
            transcript = row.transcription
            with gentle.resampled(f) as wavfile:
                aligner = gentle.ForcedAligner(resources,
                                               transcript,
                                               nthreads=40)
                try:
                    result = aligner.transcribe(wavfile,
                                                progress_cb=on_progress,
                                                logging=logging)
                except Exception:
                    # Retry this row on the next pass. (The original used a
                    # bare `except:` and appended to the very list it was
                    # iterating; successes were never removed, so the outer
                    # `while True` could never terminate.)
                    failed.append(i)
                    continue

            output = os.path.join(
                'alignments',
                '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
            pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(
                parents=True, exist_ok=True)

            # Context manager closes the handle even if to_json() raises.
            with open(output, 'w') as fh:
                fh.write(result.to_json(indent=2))
            logging.info("output written to %s" % (output))

        print("except_i_list:", failed)
        pending = failed
Beispiel #13
0
def call_gentle_chunk(wav_path,
                      transcript,
                      disfluency=False,
                      conservative=False):
    """Run Gentle on a single audio chunk and return the alignment dict."""
    gentle_resources = gentle.Resources()

    forced_aligner = gentle.ForcedAligner(gentle_resources,
                                          transcript,
                                          nthreads=multiprocessing.cpu_count(),
                                          disfluency=disfluency,
                                          conservative=conservative,
                                          disfluencies=('uh', 'um'))

    alignment = forced_aligner.transcribe(wav_path,
                                          progress_cb=_on_progress,
                                          logging=logging)

    return json.loads(alignment.to_json())
Beispiel #14
0
    def transcribe(self, output_dir, **kwargs):
        """Resample the audio in `output_dir` and force-align its transcript.

        Returns the Gentle alignment result, or -1 if resampling fails.
        Extra keyword arguments are forwarded to gentle.ForcedAligner.
        """
        orig_audio = os.path.join(output_dir, AUDIO_FILENAME)
        resample_audio = os.path.join(output_dir, RESAMPLE_FILENAME)
        transcript = os.path.join(output_dir, TEXT_FILENAME)

        logging.info('Resampling audio file %s', orig_audio)
        if gentle.resample(orig_audio, resample_audio) != 0:
            logging.info('Failed to resample %s', orig_audio)
            return -1

        def on_progress(p):
            for key, value in p.items():
                logging.info('Transcribing %s, %s, %s', resample_audio, key, value)

        logging.info('Starting to transcribe %s', output_dir)
        with open(transcript, 'r', encoding='utf-8') as fp:
            text = fp.read()
        transcriber = gentle.ForcedAligner(self.resources, text,
                                           nthreads=self.nthreads, **kwargs)
        output = transcriber.transcribe(resample_audio,
                                        progress_cb=on_progress,
                                        logging=logging)
        logging.info('Finished transcribing %s', output_dir)
        return output
Beispiel #15
0
def get_transcribed_words(textFile, audioFile):
    """Align `textFile` against `audioFile` and return Word objects.

    Each returned Word carries the phone roots (the text before the '_'
    position marker, e.g. 'ah' from 'ah_I'); words without phone data
    are skipped.
    """
    with open(textFile) as file:
        transcript = file.read()

    resources = gentle.Resources()

    with gentle.resampled(audioFile) as wavfile:
        result = gentle.ForcedAligner(resources, transcript).transcribe(wavfile)

    transcribed_words = []
    for aligned in result.words:
        if aligned.phones is None:
            continue
        roots = [p['phone'][:p['phone'].index('_')] for p in aligned.phones]
        transcribed_words.append(Word(aligned.word, roots))

    return transcribed_words
Beispiel #16
0
def align_onefile(data, align_json_path):
    """Align the single utterance addressed by `align_json_path`.

    The path encodes speaker/emotion/id as its last three components;
    id and speaker select one row of `data` (columns: id, speaker,
    sentence_path, transcription). The audio is aligned with Gentle and
    the JSON result written under 'alignments/'. Alignment failures are
    silently skipped (best effort).
    """
    import pathlib

    splitted_path = split_path(align_json_path)
    json_file_name = splitted_path[-1]
    # `id` shadowed the builtin in the original; renamed locally.
    utt_id, _ = os.path.splitext(json_file_name)
    speaker = splitted_path[-3]

    row = data[(data.id == utt_id)
               & (data.speaker == speaker)].iloc[0]  # iloc: df to series
    f = row.sentence_path
    transcript = row.transcription
    with gentle.resampled(f) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript, nthreads=40)
        try:
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
        except Exception:
            # Best effort: skip files Gentle cannot process (the original
            # used a bare `except:`, which also swallows KeyboardInterrupt).
            return

    output = os.path.join('alignments',
                          '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
    pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                          exist_ok=True)

    # Context manager closes the handle even if to_json() raises;
    # `output` is always a non-empty string, so log unconditionally.
    with open(output, 'w') as fh:
        fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
 def gentle_solve(self, audio_path, transcript):
     """
     gentle wrapper to solve the forced alignment given audio file and text string

     Returns:
         list: one dict per aligned word, with phone details stripped.
     """
     # Fixed aligner options; the thread count comes from the owning object.
     args = {
         'log': 'INFO',
         'nthreads': self.num_thread,
         'conservative': True,
         'disfluency': True,
     }
     # Filler words Gentle may insert between transcript tokens.
     disfluencies = set(['uh', 'um'])
     resources = gentle.Resources()
     with gentle.resampled(audio_path) as wavfile:
         aligner = gentle.ForcedAligner(
             resources,
             transcript,
             nthreads=args['nthreads'],
             disfluency=args['disfluency'],
             conservative=args['conservative'],
             disfluencies=disfluencies)
         result = aligner.transcribe(wavfile)
     return [word.as_dict(without="phones") for word in result.words]
Beispiel #18
0
def align_db(data):
    """Align every row of `data` with Gentle in a single pass (no retries).

    Each row supplies `sentence_path` (audio) and `transcription`; the
    JSON alignment is written to 'alignments/<last 4 path components>.json'.

    Parameters
    ----------
    data : pandas.DataFrame with `sentence_path` and `transcription` columns.
    """
    import pathlib

    for i, row in data.iterrows():
        f = row.sentence_path
        transcript = row.transcription
        with gentle.resampled(f) as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)

        output = os.path.join(
            'alignments', '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
        pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                              exist_ok=True)

        # Context manager closes the handle even if to_json() raises; the
        # original's `if output:` guard was always true (non-empty path),
        # so log unconditionally.
        with open(output, 'w') as fh:
            fh.write(result.to_json(indent=2))
        logging.info("output written to %s" % (output))
def start_aligning(audiofile, txtfile, output):
    """Align `txtfile` against `audiofile` and write Gentle JSON to `output`."""
    log_level = "INFO"  #can be one of the following: (DEBUG, INFO, WARNING, ERROR, or CRITICAL)
    logging.getLogger().setLevel(log_level)

    disfluencies = set(['uh', 'um'])

    with open(txtfile, encoding="utf-8") as fh:
        transcript = fh.read()

    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")

    with gentle.resampled(audiofile) as wavfile:
        logging.info("starting alignment")
        aligner = gentle.ForcedAligner(
            resources, transcript, nthreads
        )  #, True, False, disfluencies)#, conservative, disfluencies)
        result = aligner.transcribe(wavfile,
                                    progress_cb=on_progress,
                                    logging=logging)

    # The original opened the output file without ever closing it; the
    # context manager guarantees the handle is flushed and closed.
    with open(output, 'w', encoding="utf-8") as out_fh:
        out_fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
Beispiel #20
0
    audio_path = os.path.join(DIR_PATH, "audio", audio_name)
    transcript_path = os.path.join(DIR_PATH, "transcripts", transcript_name)

    if not os.path.isfile(transcript_path):
        continue

    # get transcript text
    transcript_text = ""
    with open(transcript_path) as f:
        transcript_text = f.read()

    # run Gentle
    print "Running Gentle on", transcript_name
    resources = gentle.Resources()
    with gentle.resampled(audio_path) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript_text)
        result = aligner.transcribe(wavfile).words

    # create gentle_results directory if it doesn't already exist
    # usually better to use try-catch here, but not worried about race conditions right now
    if not os.path.exists(os.path.join(DIR_PATH, "gentle_results")):
        os.makedirs(os.path.join(DIR_PATH, "gentle_results"))

    # write Gentle output to gentle_results directory
    with open(
            os.path.join(DIR_PATH, "gentle_results", transcript_name + ".txt"),
            "w") as f:
        output = []
        for word in result:
            output.append({
                "word": word.word,
Beispiel #21
0
def on_progress(p):
    """Emit each key/value pair of a Gentle progress dict at DEBUG level."""
    for key in p:
        logging.debug("%s: %s" % (key, p[key]))


# Read the transcript that will be aligned against the audio.
with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources(args.model_dir)
config = resources.getConfig()
logging.info("converting audio to {} sampled wav".format(config['samplerate']))

# Resample to the model's sample rate, then run the forced aligner with
# the CLI-provided thread count and disfluency options.
with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   context_width=config['context-width'],
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)

# Write the alignment JSON to the requested file, or stdout when no
# --output path was given.
fh = open(args.output, 'w') if args.output else sys.stdout
fh.write(result.to_json(indent=2))
if args.output:
    logging.info("output written to %s" % (args.output))
Beispiel #22
0
class Transcriber():
    """Manages Gentle transcription jobs and their on-disk artifacts.

    Each job gets a uid; its transcript, uploaded audio, status, and
    alignment outputs live under `<data_dir>/transcriptions/<uid>`.
    """

    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()

        self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
        self._status_dicts = {}

    def get_status(self, uid):
        """Return (creating if needed) the mutable status dict for `uid`."""
        return self._status_dicts.setdefault(uid, {})

    def out_dir(self, uid):
        """Directory that holds all artifacts for job `uid`."""
        return os.path.join(self.data_dir, 'transcriptions', uid)

    # TODO(maxhawkins): refactor so this is returned by transcribe()
    def next_id(self):
        """Pick an 8-hex-char uid not already present under data_dir."""
        uid = None
        while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
            # BUG FIX: uuid4().get_hex() is Python-2-only; .hex works everywhere.
            uid = uuid.uuid4().hex[:8]
        return uid

    def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
        """Run one transcription job, writing artifacts to its out dir.

        BUG FIX: the parameter was originally named `async`, which is a
        reserved word on Python 3.5+ and made this class a SyntaxError;
        renamed to `async_mode` (matching the sibling implementation).
        """
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)

        tran_path = os.path.join(outdir, 'transcript.txt')
        with open(tran_path, 'w') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        # Uploaded audio is raw bytes; binary mode is required on Python 3
        # (the original's text-mode 'w' only worked on Python 2).
        with open(audio_path, 'wb') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'rb')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            for k, v in p.items():
                status[k] = v

        # Forced alignment when a transcript was supplied; otherwise fall
        # back to full transcription if a language model is available.
        if len(transcript.strip()) > 0:
            trans = gentle.ForcedAligner(self.resources, transcript, nthreads=self.nthreads, **kwargs)
        elif self.full_transcriber.available:
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error']  = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile, progress_cb=on_progress, logging=logging)

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (output.to_json()));
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
Beispiel #23
0
    def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
        """Run one transcription job end-to-end for job `uid`.

        `audio` is either raw uploaded bytes or (as a str) a path/URL that
        gentle.resample can read directly. The transcript, audio, status
        JSON, alignment JSON/CSV and an inlined HTML view are written under
        the job's output directory. Progress and errors are reported via
        the shared status dict returned by `get_status(uid)`.

        NOTE(review): `async_mode` is accepted but never read in this body
        — presumably consumed by the caller/scheduler; confirm.
        """

        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {'transcript': transcript}

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)

        tran_path = os.path.join(outdir, 'transcript.txt')
        with open(tran_path, 'w') as tranfile:
            tranfile.write(transcript)
        # Raw bytes are persisted to an 'upload' file; a str `audio` is a
        # path/URL handled directly by gentle.resample below.
        if not isinstance(audio, str):
            audio_path = os.path.join(outdir, 'upload')
            with open(audio_path, 'wb') as wavfile:
                wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        # if ((not isinstance(audio, str)) and gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0) or gentle.resample(audio, wavfile) != 0:
        if (not isinstance(audio, str)) and gentle.resample(
                os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status[
                'error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        if isinstance(audio, str) and gentle.resample(audio, wavfile) != 0:
            status['status'] = 'ERROR'
            status[
                'error'] = "Encoding failed. Make sure that you've referenced a valid media URL."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        # XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        if not isinstance(audio, str):
            wav_obj = wave.open(wavfile, 'rb')
            status['duration'] = wav_obj.getnframes() / \
                float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            print(p)
            for k, v in p.items():
                status[k] = v

        # Forced alignment when a transcript was supplied; otherwise fall
        # back to full transcription if a language model is available.
        if len(transcript.strip()) > 0:
            trans = gentle.ForcedAligner(self.resources,
                                         transcript,
                                         nthreads=self.nthreads,
                                         **kwargs)
        elif self.full_transcriber.available:
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status[
                'error'] = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile,
                                  progress_cb=on_progress,
                                  logging=logging)

        # ...remove the original upload
        if not isinstance(audio, str):
            os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;",
                                  "var INLINE_JSON=%s;" % (output.to_json()))
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
Beispiel #24
0
def generate_diphones(audio_file,
                      transcript_file,
                      output_folder,
                      pre_padding=0.0,
                      post_padding=0.0) -> set:
    """Generates the list of diphones for a given audio_file using the transcript and store the diphones in the
    output_folder
    Args:
        :param audio_file:(str) Name of the audio file to segment (.wav)
        :param transcript_file:(str) Name of the text file with the transcript
        :param output_folder:(str) Name of the destination directory to store the diphones
        :param pre_padding:(float) A fraction of audio to clip before the generated diphone
        :param post_padding:(float) A fraction of audio to clip after the generated diphone
    Returns:
        :return set of generated diphones
    """
    nthreads = multiprocessing.cpu_count()
    disfluency = False
    conservative = False
    disfluencies = {'uh', 'um'}

    with open(transcript_file, encoding="utf-8") as fh:
        transcript = fh.read()
        print(transcript)
    resources = gentle.Resources()

    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=disfluency,
                                       conservative=conservative,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wavfile)
        r = json.loads(result.to_json())

    # Convert per-word phone durations into absolute [phone, start, end]
    # intervals in milliseconds.
    phone_time_list = []
    diphones = set()
    for word in r['words']:
        start = word['start'] * 1000
        for phone in word['phones']:
            diphones.add(phone['phone'])
            phone_time_list.append(
                [phone['phone'], start, start + phone['duration'] * 1000])
            start = start + phone['duration'] * 1000

    # Create the destination once, up front. The original re-checked
    # existence on every loop iteration, which is both wasteful and racy
    # between the check and the mkdir; exist_ok makes it atomic.
    os.makedirs(output_folder, exist_ok=True)

    for entry in phone_time_list:
        diphone = segment_audio(audio_file, entry[1], entry[2], pre_padding,
                                post_padding)
        # Pad very short clips up to the 150 ms minimum length.
        if len(diphone) < 150:
            try:
                diphone = ensure_length(diphone, 150)
            except exceptions.CouldntDecodeError:
                print(
                    entry[0],
                    'is very small.........................................................'
                )
        output_filename = os.path.join(output_folder, str(entry[0]) + '.wav')
        diphone.export(output_filename, format='wav')
        print('New ' + str(entry[0]) + ':' + str(len(diphone)))
    return diphones
Beispiel #25
0
def data_generator(file_id, min_dur=2, max_dur=(5, 20), randomize=False):
    """Given a file id and random seed, align the audio and text versions after
    dividing into single-speaker utterances, and write out texts of unbroken
    captured strings and their corresponding audio segments when the latter are
    between min_dur and max_length seconds.

    :param file_id: basename (no extension) of the mp3/transcript pair
    :param min_dur: minimum duration, in seconds, for a saved capture
    :param max_dur: (low, high) bounds for the maximum capture length
    :param randomize: if True, seed from the file id and redraw max_length
        from max_dur after each saved capture
    """

    if randomize:
        # Seed deterministically from the file id so reruns are reproducible.
        seed = ord(file_id[-1])
        random.seed(seed)
        max_length = random.randint(max_dur[0], max_dur[1])
    else:
        max_length = max_dur[1]

    logger.info("Processing file id {}...".format(file_id))

    # grab audio file from s3
    mp3 = os.path.join(mp3_dir, "{}.mp3".format(file_id))
    wav = os.path.join(mp3_dir, "{}.wav".format(file_id))

    if not os.path.isfile(wav):
        if not os.path.isfile(mp3):
            bucket = boto3.resource("s3").Bucket("cgws")
            logger.info("Downloading file {} from S3...".format(file_id))
            try:
                bucket.download_file("{}.mp3".format(file_id), mp3)
            # Narrowed from a bare except so ^C / SystemExit still propagate.
            except Exception:
                logger.warning(
                    "Could not download file {} from S3.".format(file_id))
                return

        # Resample to 16k mono wav; close the null sink instead of leaking it.
        with open(os.devnull, 'w') as FNULL:
            subprocess.call([
                "sox", "{}".format(mp3), "-r", "16k", "{}".format(wav),
                "remix", "-"
            ],
                            stdout=FNULL,
                            stderr=FNULL)

    # transcript
    txt_file = os.path.join(records_dir, "{}.txt".format(file_id))
    logger.info("Reading transcript {}...".format(file_id))
    try:
        with open(txt_file, "r") as tr:
            transcript = tr.read()
    except IOError:
        logger.warning("File {} does not exist.".format(txt_file))
        return

    # split transcript by speaker, and get timestamps (as seconds)
    # of the boundaries of each paragraph
    logger.info("Splitting transcript by speaker...")
    paragraphs = []
    times = []
    for paragraph in transcript.split("\n"):
        # Raw string avoids the invalid-escape DeprecationWarning for "\d".
        catch = re.match(r"\d:\d+:\d+\.\d", paragraph)
        if catch:
            timestamp = catch.group()
            h, m, s = timestamp.split(":")
            time = int(h) * 60 * 60 + int(m) * 60 + float(s)
            paragraphs.append(paragraph)
            times.append(time)
    file_end = get_duration(mp3)
    times.append(file_end)

    total_captures, captures_dur = 0, 0

    # taking one speaker at a time, find unbroken alignments up to max_length
    # and write out corresponding files
    for i, paragraph in enumerate(paragraphs):
        logger.info("Cleaning and trimming paragraph {}: \n{}".format(
            i, paragraph))

        paragraph_start, paragraph_end = times[i], times[i + 1]
        # don't bother with short files
        if paragraph_end - paragraph_start < min_dur:
            logger.info("Skipping paragraph {} (too short)...".format(i))
            continue
        if len(paragraph.split()) < 2:
            logger.info("Skipping paragraph {} (too few words)...".format(i))
            continue

        temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0,
                        "/tmp")

        # unique name of json object to read/write
        # (encode first: hashlib requires bytes on Python 3)
        paragraph_hash = hashlib.sha1("{}{}{}{}".format(
            file_id, paragraph, paragraph_start,
            paragraph_end).encode("utf-8")).hexdigest()

        if use_filename_json is True:
            json_file = os.path.join(
                json_out_dir, "{}_{}_{}.json".format(file_id, paragraph_start,
                                                     paragraph_end))
        else:
            json_file = os.path.join(json_out_dir,
                                     "{}.json".format(paragraph_hash))

        result = None

        # check if json object has been written from a previous run
        if not os.path.isfile(json_file):
            logger.info(
                "JSON file with hash {} not found.".format(paragraph_hash))

            try:
                logger.info("Resampling paragraph {}...".format(i))
                with gentle.resampled(temp_wav) as wav_file:
                    resources = gentle.Resources()
                    cleaned = clean(paragraph)
                    logger.info(
                        "Aligning paragraph {} with gentle...".format(i))
                    aligner = gentle.ForcedAligner(
                        resources,
                        cleaned,
                        nthreads=multiprocessing.cpu_count(),
                        disfluency=False,
                        conservative=False,
                        disfluencies=set(["uh", "um"]))
                    logger.info(
                        "Transcribing audio segment {} with gentle...".format(
                            i))
                    result = aligner.transcribe(wav_file)
            # Log the exception value itself; the original bare except logged
            # sys.exc_info()[2], i.e. the raw traceback object.
            except Exception as exc:
                logger.warning("Paragraph {} - {} ".format(i, exc))
                os.remove(temp_wav)
                continue

            # Check emptiness BEFORE serializing and caching the result, so an
            # empty alignment is never persisted and reused on the next run.
            if not result:
                logger.info("Empty result for paragraph {}.".format(i))
                os.remove(temp_wav)
                continue

            aligned_words = result.to_json()
            with open(json_file, "w") as f:
                f.write(aligned_words)

        else:
            logger.info(
                "Found JSON of paragraph {} -- skipping alignment and transcription by gentle"
                .format(i))

        # dictionary of aligned words
        with open(json_file) as f:
            aligned = json.loads(f.read())

        # save all consecutively captured strings
        # and keep track of their start and stop times
        captures = []
        current, start_time, end_time = [], 0, 0

        # loop through every word as returned from gentle
        logger.info("Capturing strings in paragraph {}...".format(i))

        if "words" not in aligned:
            logger.info("No words in paragraph {}.".format(i))
            os.remove(temp_wav)
            continue

        # the first five seconds are skipped (catch['start'] > 5 below)
        # even if they contain a capture
        for catch in aligned["words"]:
            # successful capture
            if catch["case"] == "success" and catch[
                    "alignedWord"] != "<unk>" and catch[
                        'start'] > 5 and catch['end'] - catch['start'] > .07:

                # new capture group
                if not current:
                    # begin capturing only if at least one second has passed
                    # since the last aligned word
                    if catch["start"] - end_time > 1:
                        current = [catch["alignedWord"]]
                        start_time = catch["start"]
                        end_time = catch["end"]

                # continuation of a capture group
                else:
                    # large gap between last capture and this one
                    # likely that something was missing in the transcript
                    if catch["start"] - end_time > 1:
                        save_capture(captures, start_time, end_time, current)
                        current = []

                    # adding this word would equal or exceed max_length
                    elif catch["end"] - start_time >= max_length:
                        save_capture(captures, start_time, end_time, current,
                                     min_dur)
                        current = []
                        if randomize:
                            max_length = random.randint(max_dur[0], max_dur[1])

                    # continue capturing
                    else:
                        current.append(catch["alignedWord"])
                        end_time = catch["end"]

            # a miss after prior success(es)
            elif current:
                save_capture(captures, start_time, end_time, current, min_dur)
                current = []

        # last word was a success but current capture hasn't been saved yet
        if current:
            save_capture(captures, start_time, end_time, current, min_dur)

        # write strings and split audio into consituent segments
        logger.info(
            "Writing text and audio segments from paragraph {}...".format(i))
        for result in captures:
            txt_segment = os.path.join(
                text_out_dir, "{}_{}_{}.txt".format(
                    file_id,
                    "{:07d}".format(int((times[i] + result["start"]) * 100)),
                    "{:07d}".format(int((times[i] + result["end"]) * 100))))
            with open(txt_segment, "w") as f:
                f.write("{}\n".format(result["string"]))

            segment = trim(file_id, temp_wav, result["start"], result["end"],
                           times[i], wav_out_dir)
            # make sure durations match
            segment_dur = get_duration(segment)
            assert segment_dur - result["duration"] <= .01

            total_captures += 1
            captures_dur += segment_dur

        # delete the clip of this speaker
        os.remove(temp_wav)

    # per-file logging
    total_dur = get_duration(mp3)
    logger.info("Wrote {} segments from {}, totalling {} seconds, out of a possible {}, ratio {:.2f}."\
          .format(total_captures,file_id,captures_dur,total_dur,captures_dur/total_dur))

    return
            if not os.path.isfile(json_file):

                temp_wav = trim(file_id, wav, paragraph_start, paragraph_end,
                                0, "/tmp")

                if not os.path.isfile(temp_wav):
                    continue

                try:
                    with gentle.resampled(temp_wav) as wav_file:
                        resources = gentle.Resources()
                        cleaned = clean(paragraph)
                        aligner = gentle.ForcedAligner(
                            resources,
                            cleaned,
                            nthreads=multiprocessing.cpu_count() *
                            args.threads_multiplier,
                            disfluency=False,
                            conservative=False,
                            disfluencies=set(["uh", "um"]))
                        result = aligner.transcribe(wav_file)

                        aligned_words = result.to_json()
                        with open(json_file, "w") as f:
                            f.write(aligned_words)

                except:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value,
                                                       exc_traceback)
                    print ''.join(line for line in lines)
                    continue
Beispiel #27
0
disfluencies = set(['uh', 'um'])


def on_progress(p):
    """Log every key/value pair of a gentle progress report at debug level."""
    for key, value in p.items():
        logging.debug("%s: %s" % (key, value))


# Read the transcript that the audio will be force-aligned against.
with open(args.txtfile) as transcript_file:
    transcript = transcript_file.read()

resources = gentle.Resources()
logging.info("converting audio to 8K sampled wav")

# Resample, align, then serialize the alignment as JSON.
with gentle.resampled(args.audiofile) as resampled_wav:
    logging.info("starting alignment")
    result = gentle.ForcedAligner(
        resources,
        transcript,
        nthreads=args.nthreads,
        disfluency=args.disfluency,
        conservative=args.conservative,
        disfluencies=disfluencies,
    ).transcribe(resampled_wav, progress_cb=on_progress, logging=logging)

# Write to the requested output file, or stdout when none was given.
out = open(args.output, 'w') if args.output else sys.stdout
out.write(result.to_json(indent=2))
if args.output:
    logging.info("output written to %s" % (args.output))
Beispiel #28
0
class Transcriber():
    """Run gentle alignments/transcriptions and persist their results on disk.

    NOTE(review): this class is Python 2 only as written -- ``async`` is used
    as a parameter name (a keyword since Python 3.7), ``print`` statements
    appear below, and ``uuid4().get_hex()`` no longer exists in Python 3.
    """

    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        # Root directory for uploads, status files and transcription output.
        self.data_dir = data_dir
        # Thread counts for forced alignment / full transcription respectively.
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()

        # Fallback used when no transcript text is supplied (see transcribe).
        self.full_transcriber = gentle.FullTranscriber(
            self.resources, nthreads=ntranscriptionthreads)
        # Per-uid mutable status dicts, shared with callers via get_status.
        self._status_dicts = {}

    def get_status(self, uid):
        """Return the status dict for ``uid``, creating an empty one lazily."""
        return self._status_dicts.setdefault(uid, {})

    def out_dir(self, uid):
        """Return the per-uid directory where transcription output lives."""
        return os.path.join(self.data_dir, 'transcriptions', uid)

    # TODO(maxhawkins): refactor so this is returned by transcribe()
    def next_id(self):
        """Draw short hex uids until one does not collide with an existing path."""
        uid = None
        while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
            uid = uuid.uuid4().get_hex()[:8]
        return uid

    def transcribe(self, uid, transcript, audio, async, **kwargs):
        """Align ``audio`` (raw bytes) against ``transcript`` and write the
        results (align.json, align.csv, time.csv, index.html) under the uid's
        output directory, updating the shared status dict along the way.

        NOTE(review): ``async`` is accepted but never read in this body --
        presumably kept for the caller's interface; confirm before removing.
        Falls back to full transcription when ``transcript`` is blank.
        """

        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {'transcript': transcript}

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)

        # Persist the raw inputs so the job can be inspected or resumed later.
        tran_path = os.path.join(outdir, 'transcript.txt')
        with open(tran_path, 'w') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'w') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        # gentle.resample returns a nonzero exit code on encoding failure.
        wavfile = os.path.join(outdir, 'a.wav')
        if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status[
                'error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        # Audio duration in seconds, exposed to clients for progress display.
        status['duration'] = wav_obj.getnframes() / float(
            wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            # Mirror gentle's progress report into the shared status dict.
            for k, v in p.items():
                status[k] = v

        # With a transcript we force-align; otherwise fall back to full
        # transcription if a language model is available.
        if len(transcript.strip()) > 0:
            trans = gentle.ForcedAligner(self.resources,
                                         transcript,
                                         nthreads=self.nthreads,
                                         **kwargs)
        elif self.full_transcriber.available:
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status[
                'error'] = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile,
                                  progress_cb=on_progress,
                                  logging=logging)

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())
        # add file datas
        # NOTE(review): trans.ms is presumably only present on the
        # ForcedAligner path; get_sentences_index appears to yield character
        # offsets of sentence ends -- confirm against gentle's API.
        sens_end_index = trans.ms.get_sentences_index()
        res = output.to_json()
        # NOTE(review): json.loads' 'encoding' kwarg was removed in Python 3.9.
        res = json.loads(res, encoding='utf-8', strict=True)
        time_sentences_index = []
        # ss_dot: index of the sentence currently being closed;
        # s_pos: start offset of the current sentence span;
        # time_pos: word index where the current sentence started.
        ss_dot = 0
        s_pos = None
        time_pos = 0
        try:
            for i, w in enumerate(res['words']):
                # Only successfully aligned words carry timing information.
                if w["case"] != "success":
                    continue
                end_v = w['endOffset']
                start_v = w['startOffset']
                if s_pos is None:
                    s_pos = start_v
                    time_pos = i

                # Crossing a sentence boundary: record its (start, end) times.
                if end_v >= sens_end_index[ss_dot]:
                    ss_dot += 1
                    time_sentences_index.append(
                        (res['words'][time_pos]["start"],
                         res['words'][i]["end"]))
                    time_pos = i
                    s_pos = end_v
            # Close the trailing sentence if the loop ended mid-sentence.
            if len(sens_end_index) != len(time_sentences_index):
                time_sentences_index.append(
                    (res['words'][time_pos]["start"], res['words'][-1]["end"]))

            #print sens_end_index, len(sens_end_index)
            #print time_sentences_index, len(time_sentences_index)
            sens_str = trans.ms.get_sentences_string()
            save_ss = ""
            # Emit "{{time}}start/end{{end}}" and "{{raw}}sentence{{end}}"
            # line pairs for each sentence span.
            for i, t in enumerate(time_sentences_index):
                #print "{{time}}%s/%s{{end}}" % (str(round(float(t[0]), 2)), str(round(float(t[1]), 2)))
                #print "{{raw}}%s{{end}}" % (str(sens_str[i]))
                save_ss += "{{time}}" + str(round(float(t[0]), 2)) + "/" + str(
                    round(float(t[1]), 2)) + "{{end}}\n"
                save_ss += "{{raw}}" + sens_str[i] + "{{end}}\n"
            with open(os.path.join(outdir, 'time.csv'), 'w') as timefile:
                timefile.write(save_ss)
        except Exception as e:
            # Sentence timing is best-effort; failures here are non-fatal.
            # (Python 2 print statement.)
            print traceback.format_exc()

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;",
                                  "var INLINE_JSON=%s;" % (output.to_json()))
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
Beispiel #29
0
        parsed_source_xml = ET.parse(source_path_xml)
        parsed_root = parsed_source_xml.getroot()
        for turn in parsed_root.findall('.//vx:Turn', namespaces):
            if 'DISCLAIMER' != turn.attrib['Speaker']:
                for fragment in turn.findall('.//vx:Fragment', namespaces):
                    target_file_txt.write(fragment.text)

    with open(target_path_txt) as target_file_txt:
        transcript = target_file_txt.read()

    source_path_mp3 = source_path_xml.replace('transcripts/extracted',
                                              'audio').replace('.xml', '.mp3')
    if os.path.isfile(source_path_mp3) and transcript:
        target_path_json = target_path_txt.replace('.txt', '.json')
        with open(target_path_json, 'w') as target_file_json:
            print('converting audio to 8K sampled wav')
            with gentle.resampled(source_path_mp3) as wavfile:
                print('starting alignment for', source_path_xml, ' and ',
                      source_path_mp3)
                aligner = gentle.ForcedAligner(resources,
                                               transcript,
                                               nthreads=nthreads,
                                               disfluency=False,
                                               conservative=False,
                                               disfluencies=disfluencies)
                result = aligner.transcribe(wavfile,
                                            progress_cb=on_progress,
                                            logging=logging)
                target_file_json.write(result.to_json(indent=2))
        print('finished alignment in', target_path_json)