Ejemplo n.º 1
0
def compute_cmvn(featdir):
    '''
    compute the per-speaker statistics used for mean and variance
    normalisation and save them on disk

    For every speaker listed in featdir/spk2utt the features of all its
    utterances are read through featdir/feats.scp and a 2 x (dim + 1) matrix
    is written with the ark writer (featdir/cmvn.scp and featdir/cmvn.ark):
        row 0: the sum of the features per dimension, followed by the total
            number of frames of the speaker
        row 1: the sum of the squared features per dimension, followed by 0

    Args:
        featdir: directory containing the spk2utt and feats.scp files, the
            statistics are written to this same directory
    '''

    #the spk2utt file is closed automatically, also when an error occurs
    with open(featdir + '/spk2utt', 'r') as spk2utt:

        #create feature reader
        reader = ark.ArkReader(featdir + '/feats.scp')

        #create writer for cmvn stats
        writer = ark.ArkWriter(featdir + '/cmvn.scp')

        try:
            #loop over speakers
            for line in spk2utt:
                #split on whitespace: this drops the end of line character
                #without eating the last character of a final line that has
                #no newline
                fields = line.split()

                #skip blank lines and speakers without any utterance
                if len(fields) < 2:
                    continue

                #stack all the utterances of the speaker in a single
                #concatenation instead of re-copying the data per utterance
                spk_data = np.concatenate(
                    [reader.read_utt(utt_id) for utt_id in fields[1:]],
                    axis=0)

                #accumulate the sums, the squared sums and the frame count
                dim = spk_data.shape[1]
                stats = np.zeros([2, dim + 1])
                stats[0, :dim] = np.sum(spk_data, 0)
                stats[1, :dim] = np.sum(np.square(spk_data), 0)
                stats[0, dim] = spk_data.shape[0]

                #write stats to file
                writer.write_next_utt(featdir + '/cmvn.ark', fields[0], stats)
        finally:
            #make sure the scp file is finalised even if a speaker fails
            writer.close()
    def decode(self, likelihood, lengths, trans=None):
        '''
        Write the phone posteriors to disk, run scripts/decode.sh on them and,
        when transcriptions are given, report the best WER.

        Args:
            likelihood: N * L * P numpy array
                N : number of utterances
                L : maximum number of frames of one utterance
                P : dimension of phone posterior (after transform)
                    The order should follow lang/phones.txt except <eps>,
                    #0~#4
            lengths: 1d numpy array of the N lengths of the utterances
            trans: transcription of the N utterances, which is a list of N
                strings. If None (or empty), scoring is skipped

        Raises:
            ValueError: if scoring was requested but no wer_<lmwt>_<penalty>
                result could be found in the decode directory
        '''

        # Write posterior to feats.scp (In order to call split_data.sh)
        writer = ark.ArkWriter(
            os.path.join(self.posterior_dir, 'feats.scp'),
            os.path.join(self.posterior_dir, 'likelihoods.ark'))
        N = likelihood.shape[0]
        n_digits = len(str(N))

        try:
            for idx, (output, n_frames) in enumerate(zip(likelihood, lengths)):
                # Drop the padding frames of this utterance
                output = output[:n_frames, :]
                # Floor exact zeros so the log below stays finite
                output = np.where(output == 0, np.finfo(float).eps, output)
                output = np.ascontiguousarray(output, dtype=np.float32)
                writer.write_next_utt(
                    str(self._number2str(idx, n_digits)).encode('utf-8'),
                    np.log(output))
        finally:
            # Close the writer even if one utterance could not be written
            writer.close()

        self._gen_utt2spk(N)
        if trans:
            scoring_cmd = 'false'
            self._write_trans(trans)
        else:
            scoring_cmd = 'true'

        os.system(
            '%s/scripts/decode.sh --cmd run.pl --skip_scoring %s --nj %s %s %s %s | tee %s/decode.log || exit 1;'
            % (os.getcwd(), scoring_cmd, self.nj, self.graph_dir,
               self.posterior_dir, self.decode_dir, self.decode_dir))

        if not trans:
            # Scoring was skipped, so this run produced no wer_* files and
            # there is no best result to look up
            return

        # Get best WER and print it
        wer = os.popen('grep WER %s/wer_* | utils/best_wer.sh' %
                       self.decode_dir).read()
        print(wer)

        # The reported file name is expected to end in wer_<lmwt>_<penalty>.
        # Search from the right so that a 'wer' inside the directory name is
        # not mistaken for the result file.
        start = wer.rfind('wer_')
        if start < 0:
            raise ValueError(
                'no WER result found in {}'.format(self.decode_dir))
        _, lmwt, penalty = wer[start:].rstrip().split('_')
        output_path = os.path.join(
            self.decode_dir,
            'scoring_kaldi/penalty_{}/{}.txt'.format(penalty, lmwt))
        copy_path = os.path.join(self.decode_dir, 'output.txt')

        os.system("cat {} | sort > {}".format(output_path, copy_path))
        print(
            "The result file of decoding corrsponding to the lowest WER is in: {}\n"
            .format(copy_path))
Ejemplo n.º 3
0
    def decode(self, featdir, decodedir):
        '''
        compute the log pseudo-likelihoods of all utterances with the neural
        net and write them to disk in kaldi feature format

        The network output is divided by the prior loaded from
        savedir/prior.npy, exact zeros are floored and the logarithm is
        written to decodedir/feats.ark (indexed by decodedir/feats.scp).

        Args:
            featdir: directory containing the feats.scp, cmvn.scp and utt2spk
                files of the data to decode
            decodedir: directory where the likelihoods are written
        '''

        #create a feature reader
        reader = batchdispenser.FeatureReader(featdir + '/feats.scp',
                                              featdir + '/cmvn.scp',
                                              featdir + '/utt2spk',
                                              int(self.conf['context_width']))

        #remove ark file if it already exists
        if os.path.isfile(decodedir + '/feats.ark'):
            os.remove(decodedir + '/feats.ark')

        #open likelihood writer
        writer = ark.ArkWriter(decodedir + '/feats.scp')

        try:
            #create a decoder
            decoder = nnetgraph.NnetDecoder(self.DNN, self.input_dim)

            #read the prior
            prior = np.load(self.conf['savedir'] + '/prior.npy')

            #start tensorflow session
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            with tf.Session(graph=decoder.graph, config=config) as session:

                #load the model
                decoder.restore(self.conf['savedir'] + '/final')

                #feed the utterances one by one to the neural net
                while True:
                    utt_id, utt_mat, looped = reader.getUtt()

                    #stop once the reader reports that it has looped
                    if looped:
                        break

                    #compute predictions
                    output = decoder(utt_mat)

                    #get state likelihoods by dividing by the prior
                    output = output / prior

                    #floor the values to avoid problems with log, np.where
                    #returns a new array so the result has to be kept
                    output = np.where(output == 0, np.finfo(float).eps,
                                      output)

                    #write the pseudo-likelihoods in kaldi feature format
                    writer.write_next_utt(decodedir + '/feats.ark', utt_id,
                                          np.log(output))
        finally:
            #close the writer, also when decoding fails halfway
            writer.close()
Ejemplo n.º 4
0
def prepare_data(datadir, featdir, conf, feat_type, dynamic):
    '''
    compute the features of all utterances and save them on disk

    The features are written to featdir/feats.ark (indexed by
    featdir/feats.scp) and the utt2spk, spk2utt, text and wav.scp files are
    copied from datadir to featdir.

    Args:
        datadir: directory containing the wav.scp, utt2spk, spk2utt and text
            files and optionally a segments file
        featdir: directory where the features will be put, it is created if
            it does not exist
        conf: feature configuration, passed to the feature computer
        feat_type: the type of features, passed to the feature computer
        dynamic: the type of dynamic information added, passed to the feature
            computer
    '''

    if not os.path.exists(featdir):
        os.makedirs(featdir)

    #read the segments
    if os.path.isfile(datadir + '/segments'):
        segments = kaldiInterface.read_segments(datadir + '/segments')
        found_segments = True
    else:
        print(
            'WARNING: no segments file found, assuming each wav file is seperate utterance'
        )
        found_segments = False

    #read the wavfiles
    wavfiles = kaldiInterface.read_wavfiles(datadir + '/wav.scp')

    #create ark writer
    writer = ark.ArkWriter(featdir + '/feats.scp')
    if os.path.isfile(featdir + '/feats.ark'):
        os.remove(featdir + '/feats.ark')

    #create a featureComputer
    comp = feat.FeatureComputer(feat_type, dynamic, conf)

    try:
        #compute all the features
        for utt in wavfiles:
            #read one wav file at a time so that only a single recording has
            #to be kept in memory
            wav = read_wav(wavfiles[utt])
            rate = wav[0]
            signal = wav[1]

            if found_segments:
                for seg in segments[utt]:
                    #the segment boundaries are multiplied by the rate to get
                    #sample indices
                    start = int(seg[1] * rate)
                    end = int(seg[2] * rate)
                    features = comp(signal[start:end], rate)
                    writer.write_next_utt(featdir + '/feats.ark', seg[0],
                                          features)
            else:
                features = comp(signal, rate)
                writer.write_next_utt(featdir + '/feats.ark', utt, features)
    finally:
        #close the writer, also when a wav file cannot be processed
        writer.close()

    #copy some kaldi files to features dir
    copyfile(datadir + '/utt2spk', featdir + '/utt2spk')
    copyfile(datadir + '/spk2utt', featdir + '/spk2utt')
    copyfile(datadir + '/text', featdir + '/text')
    copyfile(datadir + '/wav.scp', featdir + '/wav.scp')
Ejemplo n.º 5
0
def prepare_data(datadir, featdir, conf, feat_type, dynamic):
    '''
    compute the features of all segments and save them on disk

    Besides the features (featdir/feats.scp and featdir/feats.ark) the
    utt2spk, spk2utt, text and wav.scp files are copied from datadir and the
    largest number of feature frames of any utterance is written to
    featdir/maxlength.

    Args:
        datadir: directory where the kaldi data prep has been done
        featdir: directory where the features will be put
        conf: feature configuration
        feat_type: string containing the type of features, options are:
            fbank, mfcc and ssc.
        dynamic: the type of dynamic information added, options are:
            nodelta, delta and ddelta.
    '''

    if not os.path.exists(featdir):
        os.makedirs(featdir)

    #read the segments
    if os.path.isfile(datadir + '/segments'):
        segments = readfiles.read_segments(datadir + '/segments')
        found_segments = True
    else:
        print( "WARNING: no segments file found, assuming each wav file is seperate utterance")
        found_segments = False

    #create ark writer
    if os.path.isfile(featdir + '/feats.ark'):
        os.remove(featdir + '/feats.ark')
    writer = ark.ArkWriter(featdir + '/feats.scp', featdir + '/feats.ark')

    #read the wavfiles
    wavfiles = readfiles.read_wavfiles(datadir + '/wav.scp')

    #create a featureComputer
    comp = feat.FeatureComputer(feat_type, dynamic, conf)

    #compute all the features
    max_length = 0
    try:
        for utt in wavfiles:
            #read one wav file at a time so that only a single recording has
            #to be kept in memory
            wav = read_wav(wavfiles[utt])
            rate = wav[0]
            signal = wav[1]

            #collect the (utterance id, samples) pairs of this wav file
            if found_segments:
                #the segment boundaries are multiplied by the rate to get
                #sample indices
                chunks = [(seg[0],
                           signal[int(seg[1]*rate):int(seg[2]*rate)])
                          for seg in segments[utt]]
            else:
                chunks = [(utt, signal)]

            for utt_id, samples in chunks:
                features = comp(samples, rate)

                #the writer already knows the ark file, so only the id and
                #the features are passed
                writer.write_next_utt(utt_id, features)
                max_length = max(max_length, features.shape[0])
    finally:
        #close the writer, also when a wav file cannot be processed
        writer.close()

    #copy some kaldi files to features dir
    copyfile(datadir + '/utt2spk', featdir + '/utt2spk')
    copyfile(datadir + '/spk2utt', featdir + '/spk2utt')
    copyfile(datadir + '/text', featdir + '/text')
    copyfile(datadir + '/wav.scp', featdir + '/wav.scp')

    #write the maximum length in a file
    with open(featdir + '/maxlength', 'w') as fid:
        fid.write(str(max_length))
Ejemplo n.º 6
0
            line = f.readline()
    # write maxlength to a file
    with open(dev_features_dir + "/maxlength", 'w') as f:
        f.write("%s"%max_input_length)
        print("the utt's maxlength is: " + str(max_input_length))

    #create a feature reader
    with open(dev_features_dir + '/maxlength', 'r') as fid:
        max_length = int(fid.read())
    featreader = feature_reader.FeatureReader(dev_features_dir + '/feats.scp', dev_features_dir + '/utt2spk', 
        context_left, context_right, max_length)

    #create an ark writer for the likelihoods
    if os.path.isfile(decodedir + '/likelihoods.ark'):
        os.remove(decodedir + '/likelihoods.ark')
    writer = ark.ArkWriter(decodedir + '/feats.scp', decodedir + '/likelihoods.ark')

    #decode with the neural net
    nnet.decode(featreader, writer)

    print('------- decoding dev sets ----------')
    #copy the gmm model and some files to speaker mapping to the decoding dir
    os.system('cp %s %s' %(config.get('directories', 'expdir') + '/' + config.get('nnet', 'gmm_name') + '/final.mdl', decodedir))
    os.system('cp -r %s %s' %(config.get('directories', 'expdir') + '/' + config.get('nnet', 'gmm_name') + '/graph', decodedir))
    os.system('cp %s %s' %(config.get('directories', 'dev_features') + '/utt2spk', decodedir))
    os.system('cp %s %s' %(config.get('directories', 'dev_features') + '/text', decodedir))
    os.system('cp %s %s' %(config.get('directories', 'dev_features') + '/stm', decodedir))
    os.system('cp %s %s' %(config.get('directories', 'dev_features') + '/glm', decodedir))

    #change directory to kaldi egs
    os.chdir(config.get('directories', 'prjdir'))