import logging import math import os import shutil import subprocess import sys import tempfile from paths import get_binary from metasentence import MetaSentence MKGRAPH_PATH = get_binary("ext/mkgraph") def make_bigram_lm_fst(word_sequences, **kwargs): ''' Use the given token sequence to make a bigram language model in OpenFST plain text format. When the "conservative" flag is set, an [oov] is interleaved between successive words. When the "disfluency" flag is set, a small set of disfluencies is interleaved between successive words `Word sequence` is a list of lists, each valid as a start ''' if len(word_sequences) == 0 or type(word_sequences[0]) != list: word_sequences = [word_sequences] conservative = kwargs['conservative'] if 'conservative' in kwargs else False
import subprocess

from paths import get_binary

FFMPEG = get_binary("ffmpeg")


def to_wav(infile, outfile):
    '''
    Transcode the media file `infile` into a wav file at `outfile`
    by running FFMPEG.

    The output is requested as single-channel ('-ac 1'), 8000 Hz
    ('-ar 8000'), 16-bit signed little-endian PCM ('pcm_s16le').
    An existing `outfile` is overwritten ('-y') and FFMPEG's own
    logging is reduced to the 'panic' level.

    Returns the exit status reported by subprocess.call.
    '''
    # Global options come first, then the input, then the output format.
    global_args = ['-loglevel', 'panic', '-y']
    input_args = ['-i', infile]
    format_args = ['-ac', '1', '-ar', '8000', '-acodec', 'pcm_s16le']

    command = [FFMPEG] + global_args + input_args + format_args + [outfile]
    return subprocess.call(command)
import os
import subprocess

from paths import get_binary

FFMPEG = get_binary("ffmpeg")


def to_wav(path, R=8000, depth=16, nchannels=1, start=0):
    '''
    Use FFMPEG to convert a media file to a wav file with the given
    sample format.

    Arguments:
      path      -- media file handed to FFMPEG as its input ('-i')
      R         -- output sample rate, passed as '-ar'
      depth     -- accepted for interface compatibility but currently
                   unused: the codec is fixed to 'pcm_s16le'
      nchannels -- output channel count, passed as '-ac'
      start     -- seek offset, formatted with "%f" and passed as '-ss'

    Returns an IO object (the stdout pipe of the FFMPEG process) so the
    results can be streamed.
    '''
    seek = "%f" % (start)
    cmd = [FFMPEG,
           '-ss', seek,
           '-i', path,
           '-loglevel', 'panic',
           # NOTE(review): '-ss' is given both before and after '-i'
           # (input seek and output seek). Kept exactly as before;
           # confirm against the FFMPEG version in use that this does
           # not skip more than `start`.
           '-ss', seek,
           '-vn',
           '-ar', str(R),
           '-ac', str(nchannels),
           '-f', 'wav',
           '-acodec', 'pcm_s16le',
           '-']

    # Discard FFMPEG's stderr. The child inherits the descriptor when
    # Popen starts it, so the parent's handle can be closed right away
    # instead of being leaked; os.devnull also keeps this portable
    # beyond systems that have a literal '/dev/null'.
    with open(os.devnull, 'w') as devnull:
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=devnull)

    # NOTE(review): the process object is dropped here, so the caller
    # has no way to wait on FFMPEG or check its exit status.
    return p.stdout
import logging import math import os import shutil import subprocess import sys import tempfile from paths import get_binary from metasentence import MetaSentence MKGRAPH_PATH = get_binary("ext/mkgraph") def make_bigram_lm_fst(word_sequence): ''' Use the given token sequence to make a bigram language model in OpenFST plain text format. ''' word_sequence = ['[oov]', '[oov]'] + word_sequence + ['[oov]'] bigrams = {} prev_word = word_sequence[0] for word in word_sequence[1:]: bigrams.setdefault(prev_word, set()).add(word) prev_word = word node_ids = {} def get_node_id(word): node_id = node_ids.get(word, len(node_ids) + 1) node_ids[word] = node_id return node_id