Example #1
def run_gentle(seg, transcript):
    """
    Takes in a segment
    1. create new text file containing text
    2. create new audio with pydub
    3. run Gentle with these two
    4. delete text file/audio files

    Parameters
    ----------
    seg : Segment
        Segment object to align with Gentle
    transcript : str
        String holding the relevant transcript for this segment
    """
    audio_cut = seg.audio_file[1000 * seg.start_audio:1000 * seg.end_audio]

    audio_cut.export("temp_audio.wav", format="wav")

    # run Gentle
    resources = gentle.Resources()
    with gentle.resampled("temp_audio.wav") as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript)
        result = aligner.transcribe(wavfile).words

    # delete cut audio file
    os.remove("temp_audio.wav")

    # fix unaligned-word start/end time data
    fix_unaligned(result, len(audio_cut) / 1000)

    # put gentle timestamps in relation to entire file
    for word in result:
        word.start += seg.start_audio
        word.end += seg.start_audio

    return result
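The Segment type here is external to the snippet; it only needs to expose a pydub AudioSegment plus start/end offsets in seconds, and fix_unaligned is assumed to be defined elsewhere in the same module. A minimal sketch of a call, with all file names and values hypothetical:

from types import SimpleNamespace
from pydub import AudioSegment

seg = SimpleNamespace(
    audio_file=AudioSegment.from_file("episode.wav"),  # full recording (hypothetical file)
    start_audio=12.0,  # segment start, in seconds
    end_audio=19.5,    # segment end, in seconds
)
words = run_gentle(seg, "the transcript text for this segment")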
Example #2
def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, nthreads=4, ntranscriptionthreads=2, data_dir=get_datadir('webdata'), modelDir='exp'):
    logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers)

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    zip_dir = os.path.join(data_dir, 'zip')
    if not os.path.exists(zip_dir):
        os.makedirs(zip_dir)

    f = File(data_dir)

    f.putChild('', File(get_resource('www/index.html')))
    f.putChild('status.html', File(get_resource('www/status.html')))
    f.putChild('preloader.gif', File(get_resource('www/preloader.gif')))
   
    resources = gentle.Resources(modelDir)
    trans = Transcriber(data_dir, nthreads=nthreads, ntranscriptionthreads=ntranscriptionthreads, modelDir=modelDir)
    config = trans.config
    logging.info("CONFIG: samplerate %d, silencephones %s, context-width %s", config['samplerate'], config['silencephones'], config['context-width'])
    trans_ctrl = TranscriptionsController(trans)
    f.putChild('transcriptions', trans_ctrl)

    trans_zippr = TranscriptionZipper(zip_dir, trans)
    f.putChild('zip', trans_zippr)

    s = Site(f)
    logging.info("about to listen")
    reactor.listenTCP(port, s, interface=interface)
    logging.info("listening")

    reactor.run(installSignalHandlers=installSignalHandlers)
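serve() blocks inside the Twisted reactor, so it is normally the last call in the program; a minimal launch, assuming logging is configured first and signal handlers are wanted in the main thread:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    serve(port=8765, installSignalHandlers=1)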
Example #3
    def run_gentle(audio_path: str,
                   text_content: str,
                   tokenization_view: View = None):

        with gentle.resampled(audio_path) as audio_file:
            resources = gentle.Resources()
            aligner = gentle.ForcedAligner(
                resources,
                text_content,
                nthreads=multiprocessing.cpu_count(),
                disfluencies={'uh', 'um'},
                disfluency=True,
                conservative=False)
            if tokenization_view is not None:
                aligner.ms._seq = []
                for token in tokenization_view.get_annotations(Uri.TOKEN):
                    print(token.serialize(pretty=True))
                    start = token.properties['start']
                    end = token.properties['end']
                    token_text = text_content[start:end]
                    kaldi_token = {
                        'start': start,
                        'end': end,
                        'token': metasentence.kaldi_normalize(
                            token_text, aligner.ms.vocab),
                    }
                    aligner.ms._seq.append(kaldi_token)
            result = aligner.transcribe(audio_file)
            return result
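Overwriting aligner.ms._seq swaps Gentle's own tokenization of text_content for the tokens carried by the view, so the aligner's character offsets stay in sync with the caller's token boundaries. Since _seq is a private attribute of Gentle's MetaSentence, this trick depends on the internal layout of the installed Gentle version.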
Example #4
    def get_gentle_response(self, parsed_txt_path):
        """Returns the alignment response from Gentle.

        Args:
            parsed_txt_path (str): path to the parsed transcript text file

        Returns:
            list: aligned words, one dict per word
        """

        with open(parsed_txt_path, encoding="utf-8") as fh:
            transcript = fh.read()

        resources = gentle.Resources()
        # words for gentle to ignore when aligning
        disfluencies = set(['uh', 'um'])
        with gentle.resampled(self.audiopath) as wavfile:
            aligner = gentle.ForcedAligner(
                resources,
                transcript,
                nthreads=multiprocessing.cpu_count(),
                disfluency=False,
                conservative=False,
                disfluencies=disfluencies)
            result = aligner.transcribe(wavfile)

        return [word.as_dict(without="duration") for word in result.words]
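Each dict in the returned list follows Gentle's word JSON; a hedged sketch of consuming the output, using the same field names the other examples in this collection read (obj is a hypothetical instance, "transcript.txt" a placeholder path):

response = obj.get_gentle_response("transcript.txt")
for w in response:
    if w["case"] == "success":  # other cases mark words Gentle could not align
        print(w["word"], w["start"], w["end"])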
Example #5
    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()

        self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
        self._status_dicts = {}
Example #6
def align(audiopath, text, nthreads=1):
    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Starting alignment")
        aligner = gentle.ForcedAligner(resources, text, nthreads=nthreads,
                                       disfluency=False, conservative=False)
        return aligner.transcribe(wavfile, progress_cb=on_progress, logging=logging)
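on_progress is not defined in this snippet; the other examples in this collection use a small logging callback for it, e.g.:

def on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))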
Example #7
    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2, modelDir='exp'):
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources(modelDir)
        self.config = self.resources.getConfig()

        self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
        self._status_dicts = {}
Example #8
def align(audiopath, transcript):
    import gentle
    global resources
    if resources is None:
        resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("starting alignment")
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=False,
                                       conservative=False)
        return json.loads(
            aligner.transcribe(wavfile,
                               progress_cb=on_progress,
                               logging=logging).to_json())
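This snippet also leans on module-level state that is not shown: resources starts out as None and is built lazily on first use, while nthreads and on_progress must already exist in the enclosing module. A sketch of that scaffolding, with the nthreads default an assumption:

import logging
import multiprocessing

resources = None  # lazily replaced by gentle.Resources() inside align()
nthreads = multiprocessing.cpu_count()  # assumed default; not shown in the source

def on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))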
Example #9
def call_gentle_chunk(wav_path,
                      transcript,
                      disfluency=False,
                      conservative=False):
    """"""
    resources = gentle.Resources()

    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=multiprocessing.cpu_count(),
                                   disfluency=disfluency,
                                   conservative=conservative,
                                   disfluencies=('uh', 'um'))

    result = aligner.transcribe(wav_path,
                                progress_cb=_on_progress,
                                logging=logging)

    return json.loads(result.to_json())
Example #10
def get_transcribed_words(textFile, audioFile):
    with open(textFile) as file:
        transcript = file.read()

    resources = gentle.Resources()

    with gentle.resampled(audioFile) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript)
        result = aligner.transcribe(wavfile)

    transcribed_words = []
    for word in result.words:
        phones = word.phones
        if phones is not None:
            root_phones = []
            for phone in phones:
                root_phone = phone['phone'][0:phone['phone'].index('_')]
                root_phones.append(root_phone)
            transcribed_words.append(Word(word.word, root_phones))

    return transcribed_words
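The Word container here is not part of Gentle; any two-field holder matching how it is constructed works. A minimal hypothetical stand-in:

from collections import namedtuple

Word = namedtuple("Word", ["text", "phones"])  # hypothetical; mirrors Word(word.word, root_phones)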
Example #11
    def gentle_solve(self, audio_path, transcript):
        """
        Gentle wrapper to solve the forced alignment given an audio file and a
        text string.
        """
        args = {
            'log': 'INFO',
            'nthreads': self.num_thread,
            'conservative': True,
            'disfluency': True,
        }
        disfluencies = set(['uh', 'um'])
        resources = gentle.Resources()
        with gentle.resampled(audio_path) as wavfile:
            aligner = gentle.ForcedAligner(
                resources,
                transcript,
                nthreads=args['nthreads'],
                disfluency=args['disfluency'],
                conservative=args['conservative'],
                disfluencies=disfluencies)
            result = aligner.transcribe(wavfile)
        return [word.as_dict(without="phones") for word in result.words]
Example #12
def start_aligning(audiofile, txtfile, output):
    log_level = "INFO"  # can be DEBUG, INFO, WARNING, ERROR, or CRITICAL
    logging.getLogger().setLevel(log_level)

    disfluencies = set(['uh', 'um'])

    with open(txtfile, encoding="utf-8") as fh:
        transcript = fh.read()

    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")

    with gentle.resampled(audiofile) as wavfile:
        logging.info("starting alignment")
        aligner = gentle.ForcedAligner(resources, transcript,
                                       nthreads=multiprocessing.cpu_count())
        result = aligner.transcribe(wavfile,
                                    progress_cb=on_progress,
                                    logging=logging)

    with open(output, 'w', encoding="utf-8") as fh:
        fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % output)
Example #13
import json
import multiprocessing as mp
import logging
try:
    import gentle
except ModuleNotFoundError:
    import sys
    sys.path.append('/roaming/gentle')
    import gentle

resources = gentle.Resources()


def on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


def align_many(audiopaths, transcripts):
    with mp.Pool(mp.cpu_count()) as pool:
        result = pool.map(align, zip(audiopaths, transcripts))
    return result


def align(args):
    audiopath, transcript = args
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Audio file: {}".format(audiopath))
        logging.info("Transcript: <{}...>".format(transcript[:40]))
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=mp.cpu_count())
        # the source snippet breaks off here; the lines below are an assumed
        # completion mirroring the transcribe-and-return pattern of the other
        # align() examples in this collection
        return aligner.transcribe(wavfile, progress_cb=on_progress, logging=logging)
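align takes both inputs as one tuple so that pool.map can hand it a single zipped element at a time; a hypothetical call, with file names as placeholders:

results = align_many(["a.wav", "b.wav"], ["first transcript", "second transcript"])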
Example #14
import json
import logging
import multiprocessing
import os

import gentle
import scipy.io.wavfile as sciwav

DISFLUENCIES = {'uh', 'um'}  # set of disfluencies
RESOURCES = gentle.Resources()
N_THREADS = multiprocessing.cpu_count()

logging.getLogger().setLevel("INFO")


def _on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


def _get_key_val_pair(line):
    line_split = line[:-1].split()
    word = line_split[0]
    if word[-1] == ')':
        word = word.split('(')[0]

    word = word.lower()
    key = [word]
    val = []
    for phoneme in line_split[1:]:
        val.append(phoneme.lower())
    # the source snippet breaks off here; returning the parsed pair is an
    # assumed completion consistent with the function's name
    return key, val
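_get_key_val_pair parses one line of a CMUdict-style lexicon, folding alternate-pronunciation entries such as "HELLO(2)" back onto their base word; for example, the line "HELLO(2) HH AH L OW\n" would yield the pair (["hello"], ["hh", "ah", "l", "ow"]).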
Example #15
def data_generator(file_id, min_dur=2, max_dur=(5, 20), randomize=False):
    """Given a file id and random seed, align the audio and text versions after
    dividing into single-speaker utterances, and write out texts of unbroken
    captured strings and their corresponding audio segments when the latter are
    between 2 and max_length seconds.
    """

    if randomize:
        seed = ord(file_id[-1])
        random.seed(seed)
        max_length = random.randint(max_dur[0], max_dur[1])
    else:
        max_length = max_dur[1]

    logger.info("Processing file id {}...".format(file_id))

    # grab audio file from s3
    mp3 = os.path.join(mp3_dir, "{}.mp3".format(file_id))
    wav = os.path.join(mp3_dir, "{}.wav".format(file_id))

    if not os.path.isfile(wav):
        if not os.path.isfile(mp3):
            bucket = boto3.resource("s3").Bucket("cgws")
            logger.info("Downloading file {} from S3...".format(file_id))
            try:
                bucket.download_file("{}.mp3".format(file_id), mp3)
            except Exception:
                logger.warning(
                    "Could not download file {} from S3.".format(file_id))
                return

        FNULL = open(os.devnull, 'w')
        subprocess.call([
            "sox", "{}".format(mp3), "-r", "16k", "{}".format(wav), "remix",
            "-"
        ],
                        stdout=FNULL,
                        stderr=FNULL)

    # transcript
    txt_file = os.path.join(records_dir, "{}.txt".format(file_id))
    logger.info("Reading transcript {}...".format(file_id))
    try:
        with open(txt_file, "r") as tr:
            transcript = tr.read()
    except IOError:
        logger.warning("File {} does not exist.".format(txt_file))
        return

    # split transcript by speaker, and get timestamps (as seconds)
    # of the boundaries of each paragraph
    logger.info("Splitting transcript by speaker...")
    paragraphs = []
    times = []
    for paragraph in transcript.split("\n"):
        catch = re.match(r"\d:\d+:\d+\.\d", paragraph)
        if catch:
            timestamp = catch.group()
            h, m, s = timestamp.split(":")
            time = int(h) * 60 * 60 + int(m) * 60 + float(s)
            paragraphs.append(paragraph)
            times.append(time)
    file_end = get_duration(mp3)
    times.append(file_end)

    total_captures, captures_dur = 0, 0

    # taking one speaker at a time, find unbroken alignments up to max_length
    # and write out corresponding files
    for i, paragraph in enumerate(paragraphs):
        logger.info("Cleaning and trimming paragraph {}: \n{}".format(
            i, paragraph))

        paragraph_start, paragraph_end = times[i], times[i + 1]
        # don't bother with short files
        if paragraph_end - paragraph_start < min_dur:
            logger.info("Skipping paragraph {} (too short)...".format(i))
            continue
        if len(paragraph.split()) < 2:
            logger.info("Skipping paragraph {} (too few words)...".format(i))
            continue

        temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0,
                        "/tmp")

        # unique name of json object to read/write
        paragraph_hash = hashlib.sha1("{}{}{}{}".format(
            file_id, paragraph, paragraph_start,
            paragraph_end).encode("utf-8")).hexdigest()

        if use_filename_json is True:
            json_file = os.path.join(
                json_out_dir, "{}_{}_{}.json".format(file_id, paragraph_start,
                                                     paragraph_end))
        else:
            json_file = os.path.join(json_out_dir,
                                     "{}.json".format(paragraph_hash))

        result = None

        # check if json object has been written from a previous run
        if not os.path.isfile(json_file):
            logger.info(
                "JSON file with hash {} not found.".format(paragraph_hash))

            try:
                logger.info("Resampling paragraph {}...".format(i))
                with gentle.resampled(temp_wav) as wav_file:
                    resources = gentle.Resources()
                    cleaned = clean(paragraph)
                    logger.info(
                        "Aligning paragraph {} with gentle...".format(i))
                    aligner = gentle.ForcedAligner(
                        resources,
                        cleaned,
                        nthreads=multiprocessing.cpu_count(),
                        disfluency=False,
                        conservative=False,
                        disfluencies=set(["uh", "um"]))
                    logger.info(
                        "Transcribing audio segment {} with gentle...".format(
                            i))
                    result = aligner.transcribe(wav_file)
            except Exception as e:
                logger.warning("Paragraph {} - {}".format(i, e))
                os.remove(temp_wav)
                continue

            aligned_words = result.to_json()
            with open(json_file, "w") as f:
                f.write(aligned_words)

            if not result:
                logger.info("Empty result for paragraph {}.".format(i))
                os.remove(temp_wav)
                continue

        else:
            logger.info(
                "Found JSON of paragraph {} -- skipping alignment and transcription by gentle"
                .format(i))

        # dictionary of aligned words
        with open(json_file) as f:
            aligned = json.loads(f.read())

        # save all consecutively captured strings
        # and keep track of their start and stop times
        captures = []
        current, start_time, end_time = [], 0, 0

        # loop through every word as returned from gentle
        logger.info("Capturing strings in paragraph {}...".format(i))

        if not "words" in aligned:
            logger.info("No words in paragraph {}.".format(i))
            os.remove(temp_wav)
            continue

        # the first five seconds are skipped even if they contain a capture
        for catch in aligned["words"]:
            # successful capture
            if catch["case"] == "success" and catch[
                    "alignedWord"] != "<unk>" and catch[
                        'start'] > 5 and catch['end'] - catch['start'] > .07:

                # new capture group
                if not current:
                    # begin capturing only if it has been more than a second since the last word
                    if catch["start"] - end_time > 1:
                        current = [catch["alignedWord"]]
                        start_time = catch["start"]
                        end_time = catch["end"]

                # continuation of a capture group
                else:
                    # large gap between last capture and this one
                    # likely that something was missing in the transcript
                    if catch["start"] - end_time > 1:
                        save_capture(captures, start_time, end_time, current)
                        current = []

                    # adding this word would equal or exceed max_length
                    elif catch["end"] - start_time >= max_length:
                        save_capture(captures, start_time, end_time, current,
                                     min_dur)
                        current = []
                        if randomize:
                            max_length = random.randint(max_dur[0], max_dur[1])

                    # continue capturing
                    else:
                        current.append(catch["alignedWord"])
                        end_time = catch["end"]

            # a miss after prior success(es)
            elif current:
                save_capture(captures, start_time, end_time, current, min_dur)
                current = []

        # last word was a success but current capture hasn't been saved yet
        if current:
            save_capture(captures, start_time, end_time, current, min_dur)

        # write strings and split audio into constituent segments
        logger.info(
            "Writing text and audio segments from paragraph {}...".format(i))
        for result in captures:
            txt_segment = os.path.join(
                text_out_dir, "{}_{}_{}.txt".format(
                    file_id,
                    "{:07d}".format(int((times[i] + result["start"]) * 100)),
                    "{:07d}".format(int((times[i] + result["end"]) * 100))))
            with open(txt_segment, "w") as f:
                f.write("{}\n".format(result["string"]))

            segment = trim(file_id, temp_wav, result["start"], result["end"],
                           times[i], wav_out_dir)
            # make sure durations match
            segment_dur = get_duration(segment)
            assert abs(segment_dur - result["duration"]) <= .01

            total_captures += 1
            captures_dur += segment_dur

        # delete the clip of this speaker
        os.remove(temp_wav)

    # per-file logging
    total_dur = get_duration(mp3)
    logger.info("Wrote {} segments from {}, totalling {} seconds, out of a possible {}, ratio {:.2f}."\
          .format(total_captures,file_id,captures_dur,total_dur,captures_dur/total_dur))

    return
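save_capture is defined outside this snippet; from its call sites, a plausible sketch (hypothetical, inferred purely from how it is used above) is:

def save_capture(captures, start_time, end_time, current, min_dur=0):
    # keep the capture only if it is long enough
    if end_time - start_time >= min_dur:
        captures.append({
            "string": " ".join(current),
            "start": start_time,
            "end": end_time,
            "duration": end_time - start_time,
        })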
Example #16
    def __init__(self, nthreads=4, ntranscriptionthreads=2):
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()
Example #17
def generate_diphones(audio_file,
                      transcript_file,
                      output_folder,
                      pre_padding=0.0,
                      post_padding=0.0) -> set:
    """Generates the list of diphones for a given audio_file using the transcript and store the diphones in the
    output_folder
    Args:
        :param audio_file:(str) Name of the audio file to segment (.wav)
        :param transcript_file:(str) Name of the text file with the transcript
        :param output_folder:(str) Name of the destination directory to store the diphones
        :param pre_padding:(float) A fraction of audio to clip before the generated diphone
        :param post_padding:(float) A fraction of audio to clip after the generated diphone
    Returns:
        :return set of generated diphones
    """
    nthreads = multiprocessing.cpu_count()
    disfluency = False
    conservative = False
    disfluencies = {'uh', 'um'}

    with open(transcript_file, encoding="utf-8") as fh:
        transcript = fh.read()
        print(transcript)
    resources = gentle.Resources()

    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=disfluency,
                                       conservative=conservative,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wavfile)
        r = json.loads(result.to_json())

    phone_time_list = []
    diphones = set()
    for word in r['words']:
        start = word['start'] * 1000
        for phone in word['phones']:
            diphones.add(phone['phone'])
            phone_time_list.append(
                [phone['phone'], start, start + phone['duration'] * 1000])
            start = start + phone['duration'] * 1000

    for entry in phone_time_list:
        diphone = segment_audio(audio_file, entry[1], entry[2], pre_padding,
                                post_padding)
        # print('Old ' + str(entry[0]) + ':' + str(len(diphone)))
        if len(diphone) < 150:
            try:
                diphone = ensure_length(diphone, 150)
            except exceptions.CouldntDecodeError:
                print(entry[0], 'is too short to pad to 150 ms')
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        output_filename = os.path.join(output_folder, str(entry[0]) + '.wav')
        diphone.export(output_filename, format='wav')
        print('New ' + str(entry[0]) + ':' + str(len(diphone)))
    return diphones
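A hypothetical invocation, with file and directory names as placeholders; segment_audio and ensure_length are assumed to be defined elsewhere in the module:

diphones = generate_diphones("speech.wav", "speech.txt", "diphones",
                             pre_padding=0.005, post_padding=0.005)
print(sorted(diphones))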
Example #18
log_level = args.log.upper()
logging.getLogger().setLevel(log_level)

disfluencies = set(['uh', 'um'])


def on_progress(p):
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources(args.model_dir)
config = resources.getConfig()
logging.info("converting audio to {} sampled wav".format(config['samplerate']))

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   context_width=config['context-width'],
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)
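This fragment reads its options from an argparse namespace; a parser consistent with the attributes it uses (flag names and defaults are assumptions, modeled loosely on Gentle's own align.py CLI):

import argparse
import multiprocessing

parser = argparse.ArgumentParser(description='Align a transcript to audio with gentle.')
parser.add_argument('audiofile')
parser.add_argument('txtfile')
parser.add_argument('--nthreads', default=multiprocessing.cpu_count(), type=int)
parser.add_argument('--model-dir', dest='model_dir', default='exp')
parser.add_argument('--log', default='INFO')
parser.add_argument('--conservative', action='store_true')
parser.add_argument('--disfluency', action='store_true')
args = parser.parse_args()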