Ejemplo n.º 1
0
    def __init__(self, samplingFrequency=8000, framePeriod=25e-3, hopPeriod=10e-3, trainDir="./train_audio/",
                 thresh=1.2):
        """
        Build the reference MFCC templates and derive the decision threshold.

        Every ``.wav`` file found in ``trainDir`` is cut into overlapping frames and
        turned into a 39-row MFCC matrix (one column per frame). The threshold is the
        mean DTW distance over all ordered pairs of reference matrices (each matrix is
        also paired with itself), multiplied by ``thresh``.

        The sample rate stored in each file is not checked against
        ``samplingFrequency``. If no usable training file is found the threshold is
        the mean of an empty list, i.e. NaN.

        :param samplingFrequency: Sampling rate in Hz used to convert the periods into sample counts
        :type samplingFrequency: int
        :param framePeriod: Length of one analysis frame in seconds
        :type framePeriod: float
        :param hopPeriod: Shift between consecutive frames in seconds
        :type hopPeriod: float
        :param trainDir: Directory containing the training .wav files
        :type trainDir: str
        :param thresh: Factor applied to the mean pairwise DTW distance
        :type thresh: float
        """
        import warnings

        self.samplingFrequency = samplingFrequency
        self.framePeriod = framePeriod
        self.hopPeriod = hopPeriod
        self.trainDir = trainDir
        self.hopLength = int(samplingFrequency * hopPeriod)
        self.frameLength = int(samplingFrequency * framePeriod)
        self.referenceMFCC = []

        # Sorted so the order of the templates (and therefore of the distances
        # returned per template) does not depend on the file system.
        for file_name in sorted(os.listdir(trainDir)):
            if not file_name.endswith(".wav"):
                continue

            # os.path.join also works when trainDir has no trailing separator.
            (_, data) = wv.read(os.path.join(trainDir, file_name))
            num_frames = int(data.shape[0] / self.hopLength) - int(np.ceil(self.frameLength / self.hopLength))
            if num_frames <= 0:
                # A recording shorter than one frame yields no MFCC column. Storing
                # an unfilled np.empty matrix for it would put uninitialised memory
                # into the reference set, so the file is skipped instead.
                warnings.warn("Skipping %s: too short to extract a single frame" % file_name)
                continue

            MFCC_calculator = mfcc.MFCC()
            MFCC_MATRIX = np.empty([39, num_frames])
            for k in range(num_frames):
                frame_start = k * self.hopLength
                MFCC_MATRIX[:, k] = MFCC_calculator.compute_mfcc(
                    data[frame_start: frame_start + self.frameLength])
            self.referenceMFCC.append(MFCC_MATRIX)

        DTW_calculator = dtw.DTW()
        distance_list = [DTW_calculator.compute_distance(np.transpose(matrix), np.transpose(matrix2))
                         for matrix in self.referenceMFCC for matrix2 in self.referenceMFCC]
        self.thresh = np.mean(np.array(distance_list)) * thresh
Ejemplo n.º 2
0
    def distance(self, fileName):
        """
        Compute the DTW distance between a test utterance and every reference utterance.

        The utterance is framed with ``self.hopLength`` / ``self.frameLength``, each
        frame is converted into a 39-coefficient MFCC column, and the resulting matrix
        is compared against every matrix in ``self.referenceMFCC``.

        :param fileName: Path of the test utterance .wav file, or the raw sample
            buffer itself as ``bytes`` / ``bytearray``
        :type fileName: str or bytes or bytearray
        :returns: List of DTW distances, one per reference utterance, or the integer
            sentinel ``10000`` when the utterance is too short to yield a single frame
        :rtype: list or int
        """
        if isinstance(fileName, (bytes, bytearray)):
            # np.fromstring in binary mode is deprecated and rejected by recent NumPy
            # releases; np.frombuffer is the documented replacement. copy() keeps the
            # result writable and independent of the caller's buffer, as before.
            # NOTE(review): the buffer is decoded with NumPy's default dtype
            # (float64), exactly like the old call -- confirm callers do not send
            # 16-bit PCM here.
            data = np.frombuffer(fileName).copy()
        else:
            (_, data) = wv.read(fileName)

        num_frames = int(data.shape[0] / self.hopLength) - int(np.ceil(self.frameLength / self.hopLength))
        if num_frames <= 0:
            # Too short for even one frame: return the "very far away" sentinel.
            return 10000

        MFCC_calculator = mfcc.MFCC()
        # Every column is overwritten below, so the uninitialised buffer is safe.
        MFCC_MATRIX = np.empty([39, num_frames])
        for k in range(num_frames):
            frame_start = k * self.hopLength
            MFCC_MATRIX[:, k] = MFCC_calculator.compute_mfcc(
                data[frame_start: frame_start + self.frameLength])

        DTW_calculator = dtw.DTW()
        test_frames = np.transpose(MFCC_MATRIX)
        return [DTW_calculator.compute_distance(np.transpose(matrix), test_frames)
                for matrix in self.referenceMFCC]
Ejemplo n.º 3
0
def main(args):
    """Extract normalised MFCC + delta features for every entry of a word list.

    Each non-empty line of ``args.wordlist`` must contain four whitespace-separated
    fields ``word path start end``. ``path`` is converted with ``sox`` (input type
    ``sph``) to a 16 kHz WAV file named ``tmp.wav`` in the current directory;
    ``start`` and ``end`` are integer sample indices into that converted signal.

    Results:

    * ``args.mfccs`` (HDF5): dataset ``"mfccs"`` (float32, one row per frame holding
      the normalised MFCCs followed by their deltas and delta-deltas) and dataset
      ``"labels"`` (int32 rows of ``class_id, first_row, n_frames``).
    * ``args.wordkey`` (text): one ``"word index"`` line per distinct word, sorted
      by word.

    :param args: Object with the attributes ``wordlist``, ``mfccs`` and ``wordkey``
        (file paths)
    :raises subprocess.CalledProcessError: if ``sox`` fails to convert a file
    :raises ValueError: if a line does not have exactly four fields
    """
    context_pad = 480  # samples of context kept on each side of the word

    words = []
    wordstoindex = {}
    mfcc_convert = mfcc.MFCC()
    mfcc_dset = None
    label_dset = None
    data_idx = 0  # next free row in "labels"
    cur_idx = 0   # next free row in "mfccs"

    with h5py.File(args.mfccs, "w") as h5file:
        with open(args.wordlist, "r") as wordlist:
            for line in wordlist:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                word, path, start, end = fields
                if word not in wordstoindex:
                    wordstoindex[word] = len(words)
                    words.append(word)
                class_id = wordstoindex[word]
                start, end = int(start), int(end)

                # Argument list instead of a shell string: the path comes from the
                # word-list file and must not be interpreted by a shell. check_call
                # stops on a failed conversion instead of silently re-reading the
                # tmp.wav left over from the previous entry.
                subprocess.check_call(
                    ["sox", "-t", "sph", path, "-r", "16000", "-t", "wav", "tmp.wav"])
                _, samples = wavfile.read("tmp.wav")
                samples = samples.astype(float)

                # Normalisation statistics are taken from the whole utterance.
                utterance_mfccs = mfcc_convert.sig2s2mfc(samples)
                # NOTE(review): ``cutoff`` is the full sorted array of the first
                # coefficient, so the comparison below is element-wise against the
                # sorted values rather than against one threshold. This looks like
                # a missing index/percentile, but the intended value cannot be
                # determined from here, so the behaviour is left unchanged.
                cutoff = numpy.sort(utterance_mfccs[:, 0])
                speech_areas = utterance_mfccs[utterance_mfccs[:, 0] > cutoff]
                cepstral_mean = speech_areas.mean(axis=0)
                cepstral_std = speech_areas.std(axis=0)

                # Clamp at 0: a negative start index would wrap around to the end
                # of the signal and select the wrong (usually empty) region.
                word_samples = samples[max(0, start - context_pad): end + context_pad]
                word_mfccs = ((mfcc_convert.sig2s2mfc(word_samples) - cepstral_mean)
                              / cepstral_std)

                # Stack statics, deltas and delta-deltas side by side.
                n_frames, n_dims = word_mfccs.shape
                stacked = numpy.empty((n_frames, n_dims * 3), dtype=numpy.float32)
                stacked[:, :n_dims] = word_mfccs
                mfcc.deltas(word_mfccs, output_frames=stacked[:, n_dims:2 * n_dims])
                mfcc.deltas(stacked[:, n_dims:2 * n_dims],
                            output_frames=stacked[:, 2 * n_dims:])

                if mfcc_dset is None:
                    # Width follows the feature matrix (39 for 13 coefficients).
                    n_feats = stacked.shape[1]
                    mfcc_dset = h5file.create_dataset(
                        "mfccs", (n_frames, n_feats), maxshape=(None, n_feats),
                        dtype=numpy.float32)
                    label_dset = h5file.create_dataset(
                        "labels", (100, 3), maxshape=(None, 3), dtype=numpy.int32)

                needed_rows = cur_idx + n_frames
                if needed_rows >= len(mfcc_dset):
                    h5file.flush()
                    # Doubling alone is not enough when the dataset is empty or
                    # when one word is longer than the current capacity.
                    new_rows = max(2 * len(mfcc_dset), needed_rows)
                    mfcc_dset.resize((new_rows, mfcc_dset.shape[1]))
                    print("mfcc doubling %d" % cur_idx)
                if data_idx == len(label_dset):
                    h5file.flush()
                    label_dset.resize((2 * label_dset.shape[0], label_dset.shape[1]))

                mfcc_dset[cur_idx:needed_rows] = stacked
                label_dset[data_idx] = class_id, cur_idx, n_frames
                cur_idx = needed_rows
                data_idx += 1

        # Trim the over-allocated datasets down to the rows actually written.
        if mfcc_dset is not None:
            mfcc_dset.resize((cur_idx, mfcc_dset.shape[1]))
            label_dset.resize((data_idx, label_dset.shape[1]))
        h5file.flush()

    words.sort()
    with open(args.wordkey, "w") as wordkey:
        wordkey.write("\n".join(
            "%s %d" % (w, wordstoindex[w]) for w in words))
Ejemplo n.º 4
0
def process_one_file(wav_base_name):
    """Compute MFCC features for one WAV file and store them in a ``.feat`` file.

    Reads ``<wavdir>/<wav_base_name>.wav``, converts its 16-bit samples with
    ``mfcc.MFCC().sig2s2mfc``, dumps the feature array to
    ``<featdir>/<wav_base_name>.feat`` and reloads the file to verify that it
    matches what was written. Progress information is printed along the way.

    :param wav_base_name: File name without directory and without extension
    :type wav_base_name: str
    :raises ValueError: if the WAV file does not use 16-bit samples
    :raises AssertionError: if the reloaded features differ from the computed ones
    """
    wavefilename = wavdir + "/" + wav_base_name + ".wav"
    print(wavefilename)

    fh = wave.open(wavefilename, "r")
    try:
        sampwidth = fh.getsampwidth()
        print(fh.getparams())
        raw_frames = fh.readframes(fh.getnframes())
    finally:
        # Close the handle even when reading fails.
        fh.close()

    if sampwidth != 2:
        # The samples are decoded as signed 16-bit below; any other width would be
        # silently misinterpreted.
        raise ValueError("%s: expected 16-bit samples, got %d-byte samples"
                         % (wavefilename, sampwidth))

    samples = np.array(array.array("h", raw_frames))
    print(len(samples))
    print(samples[12000:12020])

    mfcc_processor = mfcc.MFCC()
    feats = mfcc_processor.sig2s2mfc(samples)
    print(len(feats))
    print(feats[10:12])

    featfilename = featdir + "/" + wav_base_name + ".feat"
    feats.dump(featfilename)
    # ndarray.dump writes a pickle, which np.load refuses to read on current NumPy
    # unless allow_pickle=True. Unpickling is acceptable here only because the
    # file was written by this very call a moment ago.
    checkfeats = np.load(featfilename, allow_pickle=True)
    assert (feats == checkfeats).all()