コード例 #1
0
def transcribe_wav(in_fname):
    """generates transcription of a given wav file

    silence is prepended to the recording by a praat script first, then the
    extended recording is passed to pocketsphinx; both temporary files (the
    extended wav and the pocketsphinx log) are deleted again, even if one of
    the external programs fails

    args:
        in_fname: file name of the wav file that should be transcribed

    returns:
        transcription of the wav file (stdout of pocketsphinx decoded as
        utf-8, with all line breaks removed)

    raises:
        subprocess.CalledProcessError: praat or pocketsphinx did not return
            with code 0
    """
    tmp_fname1 = get_unique_fname('../tmp/extended', '.wav')
    tmp_fname2 = get_unique_fname('../tmp/transcribe', '.log')

    try:
        # prepend some silence (first bit of speech might else be treated as
        # noise)
        subprocess.check_call(['praat', '../misc/prepend_silence.praat',
                               in_fname, tmp_fname1])

        # run pocketsphinx (printing to log so only transcript is written to
        # stdout)
        comp_proc = subprocess.run(
            ['pocketsphinx_continuous',
             '-infile', tmp_fname1, '-logfn', tmp_fname2],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    finally:
        # clean up on success and on failure alike; a file may not exist yet
        # if the program that should have written it failed
        # (assumes remove raises FileNotFoundError for a missing file, like
        # os.remove does)
        for tmp_fname in (tmp_fname1, tmp_fname2):
            try:
                remove(tmp_fname)
            except FileNotFoundError:
                pass

    return comp_proc.stdout.decode("utf-8").replace('\n', '').replace('\r', '')
コード例 #2
0
def detect_tts_speech_rate(tts_type, voice, rate_modifier,
                           ip_addr=None, port=None):
    """determines tts average speech rate for given rate modifier

    every line of the syllable count corpus is synthesized with the given
    rate modifier and the duration of the resulting speech is measured; lines
    for which the tts server answers with an http error are skipped

    args:
        tts_type: tts software to use; specified by one of the TTS_TYPE_*
            constants at the beginning of this module (only those tts are
            supported);
        voice: voice to be used for synthesis
        rate_modifier: rate modifier (ssml/sable) to be used for synthesis
        ip_addr: passed on to synthesize()
        port: passed on to synthesize()

    returns:
        mean speech rate and standard deviation, in syllables per second

    raises:
        ZeroDivisionError: no corpus line could be synthesized or a measured
            speech duration was 0
        (errors from synthesize() other than requests.exceptions.HTTPError
        and errors from extract_feature_values() are passed on unchanged)
    """
    syll_rates = []
    corpus = load_syllable_count_corpus()
    for line in corpus:
        # line[1] is used as the text to synthesize, line[0] as its number of
        # syllables
        if tts_type == TTS_TYPE_MARY:
            in_str = get_ssml(line[1], rate_modifier)
            input_type = INPUT_TYPE_SSML
        else:
            in_str = get_sable(line[1], rate_modifier)
            input_type = INPUT_TYPE_SABLE
        out_fname = get_unique_fname('../tmp/speech_rate', '.wav')
        try:
            try:
                synthesize(in_str, False, input_type, out_fname, tts_type,
                           ip_addr, port, voice)
            except requests.exceptions.HTTPError:
                # skip this line; the finally clause below still deletes the
                # error response that synthesize() wrote to out_fname, since
                # nobody would ever look at that file again
                continue
            duration = float(
                extract_feature_values(out_fname)['speech_duration'])
            syll_rates.append(line[0] / duration)
        finally:
            # never leave the output file behind, whatever happened above; it
            # might not exist if synthesis failed before writing anything
            # (assumes remove raises FileNotFoundError for a missing file,
            # like os.remove does)
            try:
                remove(out_fname)
            except FileNotFoundError:
                pass
    return sum(syll_rates) / len(syll_rates), numpy.std(syll_rates)
コード例 #3
0
def extract_feature_values(in_fname):
    """runs a praat script to extract a given wav file's feature values

    the script writes one "key,value" pair per line to a temporary file, which
    is parsed and then deleted again (also if the script or the parsing fails)

    args:
        in_fname: name of the wav file which should be analyzed

    returns:
        a dictionary containing several feature values, like intensity_mean;
        values are the strings found in the script output, not converted

    raises:
        subprocess.CalledProcessError: script call did not return with code 0
        ValueError: a non-blank output line does not consist of exactly two
            comma-separated fields
    """
    tmp_fname = get_unique_fname('../tmp/features', '.txt')
    feat_val_dict = {}
    try:
        subprocess.check_call(['praat', '../misc/extract_features.praat',
                               in_fname, tmp_fname])

        # extract comma-separated key value pairs from output file
        with open(tmp_fname, 'r') as out_file:
            for line in out_file:
                line = line.replace('\n', '')
                # tolerate blank lines (e.g. a trailing one) instead of
                # failing on the tuple unpacking below
                if not line.strip():
                    continue
                key, val = line.split(',')
                feat_val_dict[key] = val
    finally:
        # delete the output file in any case; it might not exist if the script
        # failed (assumes remove raises FileNotFoundError for a missing file,
        # like os.remove does)
        try:
            remove(tmp_fname)
        except FileNotFoundError:
            pass

    return feat_val_dict
コード例 #4
0
def main():
    """entry point used when the module is run directly instead of imported

    prints usage instructions, waits for the user, speaks a greeting and then
    keeps alternating between recording/transcribing the user's response and
    synthesizing/playing the generated reply until the user types "stop"
    """
    instructions = (
        'this is an interactive dialog system using speech input and output.'
        '\nit is based on the eliza system, which means its '
        'setting is that of rogerian psychotherapy.\nafter each output from '
        'the system, please hit enter to start recording your response and '
        'enter again to stop recording.\nyou might see some error messages '
        'even if the system works without issue in which case you can '
        'ignore them.\nhit enter now to start.')
    print(instructions)
    sys.stdin.read(1)

    # speak the fixed opening line of the therapist
    greeting_fname = get_unique_fname('../tmp/%s_eliza_in', '.wav')
    greeting = ('hello, i am a psychotherapist. please tell me about your '
                'problems.')
    remote_tts.synthesize(greeting, out_fname=greeting_fname)
    print('me: %s' % greeting)
    play_audio(greeting_fname)
    remove(greeting_fname)

    # keep the dialog going until the user asks to stop
    stop_requested = False
    while not stop_requested:
        user_wav = get_unique_fname('../tmp/%s_eliza_in', '.wav')
        reply_wav = get_unique_fname('../tmp/%s_eliza_out', '.wav')

        print('please hit enter and say your response or type "stop" to stop')
        if input() == 'stop':
            stop_requested = True
        else:
            # record and transcribe what the user says
            record_audio(user_wav)
            user_text = remote_tts.transcribe_wav(user_wav)
            print('you: %s' % user_text)

            # generate the reply and speak it; the recording is handed to
            # synthesize_alike() together with the reply text
            reply_text = generate_response(user_text).lower()
            print('me: %s' % reply_text)
            remote_tts.synthesize_alike(reply_text, user_wav,
                                        out_fname=reply_wav)
            play_audio(reply_wav)

            remove(user_wav)
            remove(reply_wav)
コード例 #5
0
def synthesize_with_features(in_str, speech_rate=None, intensity=None,
                             pitch=None, in_str_is_fname=False, out_fname=None,
                             tts_type=None, ip_addr=None, port=None, voice=None,
                             speech_rates_dict=None):
    """generates wav from plain text with given speech rate, intensity and pitch

    args:
        in_str: text which should be synthesized; either directly plain text or
            the name of a file from which to read plain text
        speech_rate: target mean speech rate in syllables per second (3.0-8.0);
            values outside of that range are clamped to it; the default rate
            of the tts is used if none given
        intensity: target mean intensity in decibel; passed on to adapt_wav()
        pitch: target mean pitch in hertz; 'default' is passed to the markup
            generation if none given
        speech_rates_dict: see load_speech_rates_dict(); offered as a parameter
            so it can be loaded once and reused for efficiency; loaded in this
            function if none given (only when a speech rate is requested)
        (for details on other parameters see synthesize())

    returns and raises:
        see synthesize(); additionally raises KeyError if speech_rates_dict
        has no entry for the tts, voice and (rounded) speech rate
    """
    # if in_str is a file name, read string to synthesize from that file
    if in_str_is_fname:
        with open(in_str, 'r') as in_file:
            in_str = in_file.read()

    pitch = pitch if pitch else 'default'

    # select markup type, rate table and default voice for the requested tts
    if not tts_type or tts_type == TTS_TYPE_MARY:
        input_type = INPUT_TYPE_SSML
        rates_key = 'mary'
        get_markup = get_ssml
        voice = voice if voice else DEFAULT_VOICE_MARY
    elif tts_type == TTS_TYPE_FESTIVAL:
        input_type = INPUT_TYPE_SABLE
        rates_key = 'festival'
        get_markup = get_sable
        # festival needs its own default voice (same one synthesize() uses),
        # a marytts voice name cannot be looked up in the festival rates
        voice = voice if voice else DEFAULT_VOICE_FESTIVAL
    else:
        raise ValueError('given tts_type not supported')

    if speech_rate is not None:
        # adjust target speech rate to be within the supported range
        if speech_rate < 3.0:
            speech_rate = 3.0
        elif speech_rate > 8.0:
            speech_rate = 8.0
        if not speech_rates_dict:
            speech_rates_dict = load_speech_rates_dict()
        # the table is indexed by speech rate rounded to one decimal
        rate_modifier = \
            speech_rates_dict[rates_key][voice][round(speech_rate, 1)]
    else:
        # no target given (comparing None with a number would raise)
        rate_modifier = 'default'

    # generate appropriate markup from plain text; only speech rate and pitch
    # are adjusted that way, intensity through praat (this combination is most
    # efficient and accurate)
    in_str = get_markup(in_str, rate_modifier, pitch)

    tmp_fname = synthesize(in_str, False, input_type, None, tts_type,
                           ip_addr, port, voice)
    try:
        out_fname = out_fname if out_fname \
            else get_unique_fname('../tmp/synthesis_final', '.wav')
        adapt_wav(tmp_fname, out_fname, intensity=intensity)
    finally:
        # the intermediate file is not needed anymore, whether adapting it
        # worked or not (assumes remove raises FileNotFoundError for a missing
        # file, like os.remove does)
        try:
            remove(tmp_fname)
        except FileNotFoundError:
            pass
    return out_fname
コード例 #6
0
def synthesize(in_str, in_str_is_fname=False, input_type=None, out_fname=None,
               tts_type=None, ip_addr=None, port=None, voice=None):
    """sends given string to a tts server and writes response to a file

    args:
        in_str: plain text or markup string for synthesis or name of a file that
            contains such a string
        in_str_is_fname: whether in_str should be treated as a file name (True)
            or directly as a string to synthesize (False, default)
        input_type: whether the input is plain text (default) or some markup;
            specified by one of the INPUT_TYPE_* constants at the beginning of
            this module
        out_fname: server response is written to this file location; if the
            return status is not ok, this contains additional info; if no name
            is given, a default will be used and returned by this function
        tts_type: tts software to use; specified by one of the TTS_TYPE_*
            constants at the beginning of this module (only those tts are
            supported); marytts is used if none given
        ip_addr: ip address of the tts server, localhost is used if none given
        port: port of the tts server, default for tts is used if none given
        voice: name of the voice to use; default for tts is used if none given

    returns:
        name of the output file, same as out_fname if that was given

    raises:
        requests.exceptions.RequestException: the connection failed or the
            server did not return an ok status (marytts)
        subprocess.CalledProcessError: festival_client did not return with
            code 0 (festival)
        ValueError: the given tts_type or input_type is not supported
    """
    # if in_str is a file name, read string to synthesize from that file
    if in_str_is_fname:
        with open(in_str, 'r') as in_file:
            in_str = in_file.read()

    # set defaults for missing parameters
    input_type = input_type if input_type else INPUT_TYPE_TEXT
    tts_type = tts_type if tts_type else TTS_TYPE_MARY
    ip_addr = ip_addr if ip_addr else '127.0.0.1'
    out_fname = out_fname if out_fname \
        else get_unique_fname('../tmp/synthesis', '.wav')
    if tts_type == TTS_TYPE_MARY:
        port = port if port else 59125
        voice = voice if voice else DEFAULT_VOICE_MARY
    elif tts_type == TTS_TYPE_FESTIVAL:
        port = port if port else 1314
        voice = voice if voice else DEFAULT_VOICE_FESTIVAL

    # communicate with tts server in individually appropriate way
    if tts_type == TTS_TYPE_MARY:
        if input_type != INPUT_TYPE_TEXT and input_type != INPUT_TYPE_SSML:
            raise ValueError('given input_type not supported for marytts')

        params = {
            'INPUT_TEXT': in_str,
            'INPUT_TYPE': input_type,
            'OUTPUT_TYPE': 'AUDIO',
            'LOCALE': 'en_US',
            'AUDIO': 'WAVE_FILE',
            'VOICE': voice
        }
        resp = requests.post('http://%s:%d/process' % (ip_addr, port),
                             data=params, stream=True)
        with open(out_fname, 'wb') as out_file:
            for chunk in resp.iter_content(8192):
                out_file.write(chunk)
        # raise exception if http request came back with an error; do this only
        # after writing output so failure response is logged
        # TODO: write to different file (txt, not wav) and include note in msg?
        resp.raise_for_status()
    elif tts_type == TTS_TYPE_FESTIVAL:
        args = ['festival_client', '--server', ip_addr, '--port', str(port),
                '--ttw', '--otype', 'wav', '--output', out_fname]
        # temporary files handed to festival_client, deleted again below
        tmp_fnames = []
        try:
            if input_type == INPUT_TYPE_TEXT:
                # for plain text input, voice must be specified in prolog file
                # NOTE(review): this file holds text, not audio; the '.wav'
                # extension looks like a leftover - confirm before renaming
                prolog_fname = get_unique_fname('../tmp/festival_prolog',
                                                '.wav')
                tmp_fnames.append(prolog_fname)
                with open(prolog_fname, 'wb') as prolog_file:
                    prolog_file.write(('(%s)' % voice).encode('utf-8'))
                args.append('--prolog')
                args.append(prolog_fname)
            elif input_type == INPUT_TYPE_SABLE:
                # for sable input, tts_mode option must be set
                args.append('--tts_mode')
                args.append('sable')
                # for consistent interface in this function, voice is not
                # assumed to already be specified in given sable string but
                # set here instead
                in_str = in_str.replace('<<<voice>>>', voice)
            else:
                raise ValueError(
                    'given input_type not supported for festivaltts')

            in_fname = get_unique_fname('../tmp/festival_input')
            tmp_fnames.append(in_fname)
            with open(in_fname, 'wb') as tmp_file:
                tmp_file.write(in_str.encode('utf-8'))
            args.append(in_fname)

            subprocess.check_call(args)
        finally:
            # delete input and prolog file even if festival_client failed; a
            # file may be missing if writing it did not succeed (assumes
            # remove raises FileNotFoundError for a missing file, like
            # os.remove does)
            for tmp_fname in tmp_fnames:
                try:
                    remove(tmp_fname)
                except FileNotFoundError:
                    pass
    else:
        raise ValueError('given tts_type not supported')

    return out_fname