Code Example #1
File: utils.py Project: dynfus/dynfus
def getVectorizedPairBatch(pair_batch, modality, train=True):
    # TODO: Refactor
    ftype = 'train' if train else 'val'
    if modality == 's':
        with open('{}s2t/{}.feat_map.pkl'.format(conf['data_dir'], ftype),
                  'rb') as f:
            feat_map = pickle.load(f)
        final_pair_batch = []
        for x, y in pair_batch:
            final_pair_batch.append(
                [ki.read_mat(ku.get_ark_rep(x, feat_map)), y])
        return final_pair_batch
    elif modality == 'ss':
        with open('{}s2t/{}.feat_map.pkl'.format(conf['data_dir'], ftype),
                  'rb') as f:
            feat_map = pickle.load(f)
        final_pair_batch = []
        for x, y in pair_batch:
            e = ki.read_mat(ku.get_ark_rep(x, feat_map))
            final_pair_batch.append([e, e])
        return final_pair_batch
    elif modality == 'v':
        video_feats = np.load(
            '{}v2t/resnext101-action-avgpool-300h/{}.npy'.format(
                conf['data_dir'], ftype))
        return [[video_feats[idx], y] for idx, (_, y) in enumerate(pair_batch)]
    elif modality == 'vv':
        video_feats = np.load(
            '{}v2t/resnext101-action-avgpool-300h/{}.npy'.format(
                conf['data_dir'], ftype))
        final_pair_batch = []
        for idx, (_, y) in enumerate(pair_batch):
            e = video_feats[idx]
            final_pair_batch.append([e, e])
        return final_pair_batch
    elif modality in ['sv', 'ss-vv']:
        # Load speech vectors
        with open('{}s2t/{}.feat_map.pkl'.format(conf['data_dir'], ftype),
                  'rb') as f:
            feat_map = pickle.load(f)
        # Load video vectors
        video_feats = np.load(
            '{}v2t/resnext101-action-avgpool-300h/{}.npy'.format(
                conf['data_dir'], ftype))
        final_pair_batch = []
        # Note that the text (t) is simply discarded for training mode
        for idx, (x, t) in enumerate(pair_batch):
            if train:
                final_pair_batch.append([
                    video_feats[idx],
                    ki.read_mat(ku.get_ark_rep(x, feat_map))
                ])
            else:
                final_pair_batch.append([
                    video_feats[idx],
                    ki.read_mat(ku.get_ark_rep(x, feat_map)), t
                ])
        return final_pair_batch
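A hedged usage sketch for getVectorizedPairBatch above: conf, ki (kaldi_io), and ku are module-level names from the original project, and the pair batch below is hypothetical.

# Usage sketch (hypothetical utterance ids and transcripts):
pair_batch = [('utt_0001', 'a cat sat'), ('utt_0002', 'on the mat')]
speech_batch = getVectorizedPairBatch(pair_batch, modality='s', train=True)
# speech_batch[i] == [feature_matrix_for_utt_i, transcript_i]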
Code Example #2
    def __getitem__(self, idx):
        rec = self.recs[idx]
        utts = self.rec2utt_dict[rec]
        spkrs = [self.utt2spk_dict[u] for u in utts]

        ref_rttm_lines = self.rttm_lines_from_rec(rec)
        # hyp_rttm_lines, segutts = self.segments_lines_from_rec(rec)
        # assert (segutts == utts).all()
        segcols = self.segments_cols_from_rec(rec)

        okay_feats = []
        okay_spkrs = []
        okay_idx = []

        fpaths = [self.utt_fpath_dict[utt] for utt in utts]
        for i, fpath in enumerate(fpaths):
            try:
                okay_feats.append(torch.FloatTensor(read_mat(fpath)))
                okay_spkrs.append(spkrs[i])
                okay_idx.append(i)
            except Exception:
                print('Reading utterance {} failed'.format(utts[i]))
                continue

        okay_idx = np.array(okay_idx)
        get_lines = lambda a: a[okay_idx]
        newsegcols = [get_lines(c) for c in segcols]

        return okay_feats, okay_spkrs, ref_rttm_lines, newsegcols, rec
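Since __getitem__ returns a tuple of variable-length lists plus strings, the default DataLoader collation cannot stack it. A hedged sketch of wiring it up, assuming `dataset` is an instance of the class above:

from torch.utils.data import DataLoader

# One recording per batch; the identity collate_fn passes the tuple through as-is.
loader = DataLoader(dataset, batch_size=1, collate_fn=lambda batch: batch[0])
for feats, spkrs, ref_rttm_lines, segcols, rec in loader:
    pass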
Code Example #3
def recognize(args):
    # model
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)
    model.load_state_dict(flow.load(args.model_path))
    device = flow.device("cuda")
    model.eval()
    model.to(device)
    LFR_m = args.LFR_m
    LFR_n = args.LFR_n
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    # decode each utterance
    new_js = {}
    with flow.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print("(%d/%d) decoding %s" % (idx, len(js.keys()), name), flush=True)
            input = kaldi_io.read_mat(js[name]["input"][0]["feat"])
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = flow.tensor(input).to(dtype=flow.float32)
            input_length = flow.tensor([input.size(0)], dtype=flow.int64)
            input = input.to(device)
            input_length = input_length.to(device)
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, "wb") as f:
        f.write(json.dumps({"utts": new_js}, indent=4, sort_keys=True).encode("utf_8"))
Code Example #4
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    # import Language Model
    lm_model = kenlm.Model(args.lm_path)
    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list,
                                         lm_model, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
Code Example #5
    def __getitem__(self, counter):
        index = self.mapping[counter]
        utt_id = self.key_dic[index]
        X = np.expand_dims(read_mat(self.ark_dic[index]), axis=0)
        y = self.label_dic[index]

        return utt_id, X, y
Code Example #6
def load_utt(ark, utt, position):
    with open(ark, 'rb') as f:
        f.seek(position - len(utt) - 1)
        ark_key = kaldi_io.read_key(f)
        assert ark_key == utt, f'Keys do not match: `{ark_key}` and `{utt}`.'
        mat = kaldi_io.read_mat(f)
        return mat
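A hedged usage sketch for load_utt. In a Kaldi feats.scp, the offset after the colon points just past the key and its trailing space in the ark, which is why the function seeks back by len(utt) + 1 before reading the key.

# Usage sketch: derive (ark, utt, position) from an scp line (hypothetical paths).
scp_line = 'utt_0001 /data/feats.ark:12345'
utt, rxfile = scp_line.split()
ark, position = rxfile.rsplit(':', 1)
mat = load_utt(ark, utt, int(position))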
Code Example #7
File: dataset.py Project: xdcesc/pychain_example
    def __init__(self, feat_dir, fst_dir):
        super(ChainDataset, self).__init__()
        self.feat_dir = feat_dir
        self.fst_dir = fst_dir

        self.feat_scp = os.path.join(self.feat_dir, 'feats.scp')
        self.fst_scp = os.path.join(self.fst_dir, 'fst_nor.1.scp')
        self.feat_len_map = os.path.join(self.feat_dir, 'utt2featlen.txt')

        if not os.path.exists(self.feat_len_map):
            print(
                '{} does not exist; generating utt2featlen.txt (a map from utt_id'
                ' to feature length) for the first time... It is used to form'
                ' minibatches of similar lengths during training.'.format(self.feat_len_map))
            with open(self.feat_len_map, 'w') as map_f:
                with open(self.feat_scp) as f:
                    for i, line in tqdm(enumerate(f)):
                        utt_id, feat_ark = line.strip().split()
                        feat = kaldi_io.read_mat(feat_ark)
                        feat_len = feat.shape[0]
                        map_f.write('{} {}\n'.format(utt_id, feat_len))
        # Pairing
        self.samples = []  # list of dicts
        self.utt_ids = {}  # a dict that maps utt_ids(str) to id(int)

        self.samples_tmp = []
        with open(self.feat_scp) as f:
            for i, line in enumerate(f):
                utt_id, feat_ark = line.strip().split()
                self.utt_ids[utt_id] = i
                self.samples_tmp.append({'utt_id': utt_id, 'feat': feat_ark})

        with open(self.feat_len_map) as f:
            for i, line in enumerate(f):
                utt_id, feat_len = line.strip().split()
                id = self.utt_ids[utt_id]
                self.samples_tmp[id]['feat_len'] = int(feat_len)

        # we always cache all fsts into memory at once as they're relatively small
        with open(self.fst_scp) as f:
            print("Loading training FSTs...")
            for i, line in tqdm(enumerate(f)):
                utt_id, fst_rxf = line.strip().split()
                if utt_id not in self.utt_ids:
                    raise ValueError(
                        '{} has no corresponding feats'.format(utt_id))
                id = self.utt_ids[utt_id]
                filename, offset = self.parse_rxfile(fst_rxf)
                filename = filename.split('/')[-1]
                file_path = os.path.join(self.fst_dir, filename)
                fst = simplefst.StdVectorFst.read_ark(file_path, offset)
                graph = ChainGraph(fst)
                dict_tmp = self.samples_tmp[id]
                dict_tmp['graph'] = graph
                self.samples.append(dict_tmp)
                #self.samples[id]['graph'] = graph

        # sort the samples by their feature length
        self.samples = sorted(self.samples,
                              key=lambda sample: sample['feat_len'])
Code Example #8
def load_data(training_triples):
    data = {}
    data['key'] = [triple[0] for triple in training_triples]
    data['src_seq'] = [triple[1] for triple in training_triples]
    data['tgt_seq'] = [triple[2] for triple in training_triples]

    # load the feature matrix for each source rxfile
    loaded = []
    for rxfile in data['src_seq']:
        mat = kaldi_io.read_mat(rxfile)
        loaded.append(mat)
    data['src_seq'] = loaded

    data['src_seq'], data['src_pad_mask'] = instances_handler.pad_to_longest(
        data['src_seq'])
    data['tgt_seq'], data['tgt_pad_mask'] = instances_handler.pad_to_longest(
        data['tgt_seq'])

    data['src_seq'] = np.array(data['src_seq'])
    data['src_pad_mask'] = np.array(data['src_pad_mask'])
    data['tgt_seq'] = np.array(data['tgt_seq'])
    data['tgt_pad_mask'] = np.array(data['tgt_pad_mask'])

    archive = {
        'key': data['key'],
        'src_seq': data['src_seq'],
        'src_pad_mask': data['src_pad_mask'],
        'tgt_seq': data['tgt_seq'],
        'tgt_pad_mask': data['tgt_pad_mask']
    }
    return archive
Code Example #9
File: utils.py Project: valentinp72/espresso
def compute_num_frames_from_feat_or_waveform(rxfile: str) -> int:
    if re.search(r"\.ark:\d+$", rxfile.strip()) is not None:  # from feats.scp
        if not has_kaldi_io:
            raise ImportError(
                "Please install kaldi_io with: pip install kaldi_io")
        try:
            feat = kaldi_io.read_mat(rxfile)
        except Exception:
            raise Exception("failed to read feature matrix {}.".format(rxfile))
        assert feat is not None and isinstance(feat, np.ndarray)
        num_frames = feat.shape[0]
    elif re.search(r"\|$", rxfile.strip()) is not None:  # from a command
        source = BytesIO(run(rxfile[:-1], shell=True, stdout=PIPE).stdout)
        waveform, sample_rate = get_waveform(source, always_2d=True)
        num_frames = num_samples_to_num_frames(waveform.shape[1],
                                               sample_rate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    else:  # from a raw waveform file
        if not has_soundfile:
            raise ImportError(
                "Please install soundfile with: pip install soundfile")
        info = soundfile.info(rxfile)
        num_frames = num_samples_to_num_frames(info.frames,
                                               info.samplerate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    return num_frames
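A hedged usage sketch showing the three rxfile forms the function above distinguishes (all paths and commands are hypothetical):

n1 = compute_num_frames_from_feat_or_waveform('/data/feats.ark:12345')     # precomputed features
n2 = compute_num_frames_from_feat_or_waveform('sox in.wav -t wav - |')     # piped command
n3 = compute_num_frames_from_feat_or_waveform('/data/audio/utt_0001.wav')  # raw waveform file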
Code Example #10
def load_inputs_and_targets(batch, token2idx, label_type, LFR_m, LFR_n):
    # From: espnet/src/asr/asr_utils.py: load_inputs_and_targets
    # load acoustic features and target sequence of token ids
    # for b in batch:
    #     print(b[1]['input'][0]['feat'])
    xs = [kaldi_io.read_mat(b[1]['input'][0]['feat']) for b in batch]
    ys = [b[1]['output'][0][label_type].split() for b in batch]

    if LFR_m != 1 or LFR_n != 1:
        # xs = build_LFR_features(xs, LFR_m, LFR_n)
        xs = [build_LFR_features(x, LFR_m, LFR_n) for x in xs]

    # get index of non-zero length samples
    nonzero_idx = filter(lambda i: len(ys[i]) > 0, range(len(xs)))
    # sort in input lengths
    nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
    if len(nonzero_sorted_idx) != len(xs):
        print("warning: Target sequences include empty token")

    # remove zero-length samples
    xs = [xs[i] for i in nonzero_sorted_idx]
    ys = [
        np.fromiter(map(lambda x: token2idx[x], ys[i]), dtype=np.int64)
        for i in nonzero_sorted_idx
    ]

    return xs, ys
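A hedged usage sketch: the batch is a list of (utt_id, info) pairs taken from an espnet-style data JSON; the file name and vocabulary below are hypothetical.

import json

with open('data.json', 'rb') as f:
    js = json.load(f)['utts']
batch = list(js.items())                  # [(utt_id, info), ...]
token2idx = {'<unk>': 0, 'a': 1, 'b': 2}  # hypothetical vocabulary
xs, ys = load_inputs_and_targets(batch, token2idx, 'token', LFR_m=4, LFR_n=3)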
Code Example #11
    def __init__(self, mu_path, ep_path, transform_dir, enroll_feats,
                 test_feats, trial):
        self.S_mu = np.load(mu_path)
        self.S_ep = np.load(ep_path)
        self.enroll = {}
        for key, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
            self.enroll[key] = mat
        self.test = {}
        for key, mat in kaldi_io.read_vec_flt_scp(test_feats):
            self.test[key] = mat
        self.len = len(self.test)
        self.scores = np.zeros((self.len, self.len))
        self.trial_path = trial

        xvectors = []
        for _, mat in kaldi_io.read_vec_flt_scp(enroll_feats):
            xvectors.append(mat)
        xvectors = np.array(xvectors)
        transform = EstPca(xvectors, target_energy=0.1)
        adapt_transform = np.array(kaldi_io.read_mat(transform_dir))
        self.adapt_transform = transform
        self.transform = transform
        self.S_mu = np.dot(np.dot(adapt_transform, self.S_mu),
                           adapt_transform.T)
        self.S_ep = np.dot(np.dot(adapt_transform, self.S_ep),
                           adapt_transform.T)
        self.S_mu = np.dot(np.dot(transform.T, self.S_mu), transform)
        self.S_ep = np.dot(np.dot(transform.T, self.S_ep), transform)
        F = np.linalg.pinv(self.S_ep)
        G = np.dot(
            np.dot(-np.linalg.pinv(2 * self.S_mu + self.S_ep), self.S_mu), F)
        self.A = np.linalg.pinv(self.S_mu + self.S_ep) - (F + G)
        self.G = G

Code Example #12
File: mao.py Project: Chung-I/tsm-rnnt
def func(utt_ids):
    utt_datas = []
    for utt_id in utt_ids:
        rx_file = raw_src_datas[utt_id]
        utt_datas.append(kaldi_io.read_mat(rx_file))
    source_data = np.concatenate(utt_datas, axis=0)
    return source_data
Code Example #13
def save_data_info_tar(tar_file_path,
                       minibatch_info,
                       all_data_info,
                       fea_dim,
                       logger,
                       downsampled=False):
    tar_file = tarfile.TarFile(tar_file_path, 'w')
    for i in range(all_data_info.shape[0]):
        logger.info('Writing minibatch: %d' % (i + 1))
        len_1 = minibatch_info[i][1] // 2 if downsampled else minibatch_info[i][1]
        # mat = np.zeros((len(all_data_info[i]), len_1, fea_dim), dtype=np.float32)
        mat = np.zeros((len(all_data_info[i]), len_1, fea_dim),
                       dtype=np.float16)
        for j, read_info in enumerate(all_data_info[i]):
            m = kaldi_io.read_mat(read_info[0])
            len_2 = read_info[2] // 2 if downsampled else read_info[2]
            assert m.shape[1] == mat.shape[2] and len_2 == mat.shape[1]
            temp = m[read_info[1]:read_info[1] + read_info[2], :]
            if downsampled:
                # start from frame 1 to work fine for both odd and even array size
                temp = temp[1::2, :]
                assert temp.shape[0] == len_2
            # mat[j, :, :] = temp
            mat[j, :, :] = temp.astype(dtype=np.float16)
        __add2tar_file(tar_file, mat, 'minibatch_' + str(i) + '.npy')
    tar_file.close()
コード例 #14
0
File: dataset.py Project: Chaanks/stklia
def make_kaldi_ds(ds_path, seq_len=400, evaluation=False, trials=None):
    """
    Make a SpeakerDataset from the path of a Kaldi data directory alone.
    This function uses the files 'feats.scp', 'utt2spk', and 'spk2utt'
    present in ds_path to create the SpeakerDataset.
    """
    if not isinstance(ds_path, list):
        ds_path = [ds_path]

    utt2spk, spk2utt, utt2path = {}, {}, {}
    for path in ds_path:
        utt2path.update(data_io.read_scp(path / 'feats.scp'))
        utt2spk.update(data_io.read_scp(path / 'utt2spk'))
        # can't do spk2utt.update(t_spk2utt) as update is not additive
        t_spk2utt = data_io.load_one_tomany(path / 'spk2utt')
        for spk, utts in t_spk2utt.items():
            try:
                spk2utt[spk] += utts
            except KeyError:
                spk2utt[spk] = utts

    ds = SpeakerDataset(
        utt2path=utt2path,
        utt2spk=utt2spk,
        spk2utt=spk2utt,
        loading_method=lambda path: torch.FloatTensor(read_mat(path)),
        seq_len=seq_len,
        evaluation=evaluation,
        trials=trials,
    )
    return ds
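A hedged usage sketch: ds_path entries are joined with '/' inside the function, so they should be pathlib.Path objects (directory names are hypothetical):

from pathlib import Path

train_ds = make_kaldi_ds([Path('data/train_a'), Path('data/train_b')], seq_len=400)
eval_ds = make_kaldi_ds(Path('data/test'), evaluation=True, trials='data/test/trials')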
Code Example #15
def load_mfccs_from_numbatch_old(mega_dict, num_to_id_dict, data, device):
    data_mfcc = []
    durs = []
    for i, d in enumerate(data):
        fd = mega_dict[num_to_id_dict[int(d)]]
        data_mfcc_temp = kaldi_io.read_mat(fd).T
        data_mfcc.append(data_mfcc_temp)
        durs.append(data_mfcc_temp.shape[1])
    try:
        tmparr = np.asarray(data_mfcc)
        data_mfcc = tmparr
    except ValueError:
        # ragged matrices: fall back to an explicit object array
        tmparr = np.empty(len(data_mfcc), dtype=object)
        for i in range(len(data_mfcc)):
            tmparr[i] = data_mfcc[i]
        data_mfcc = tmparr
    if len(data_mfcc.shape) > 1:
        tensor_X = torch.from_numpy(np.asarray(data_mfcc)).float().to(device)
        return tensor_X
    else:
        sorted_durs, sort_idx = torch.sort(torch.tensor(durs))
        data_mfcc = data_mfcc[sort_idx]
        _, unsort_idx = torch.sort(sort_idx)
        uniq_durs, uniq_counts = torch.unique(sorted_durs, return_counts=True)
        split_sections = tuple(np.cumsum(uniq_counts)[:-1])
        data_mfcc = np.split(data_mfcc, split_sections)
        tensor_mfcc_list = [
            torch.from_numpy(np.asarray(list(mfcc))).float().to(device)
            for mfcc in data_mfcc
        ]
        return tensor_mfcc_list, sort_idx, unsort_idx
Code Example #16
def extract_till_plda_embeddings(model, mega_mfcc_dict, data_loader,
                                 num_to_id_dict, device):
    utts = []
    for x1, x2, l in data_loader:
        for x in x1:
            if num_to_id_dict[int(x)] not in utts:
                utts.append(num_to_id_dict[int(x)])
                # numframes.append(mega_utt2num_frames_dict[num_to_id_dict[int(x)]])
        for x in x2:
            if num_to_id_dict[int(x)] not in utts:
                utts.append(num_to_id_dict[int(x)])
                # numframes.append(mega_utt2num_frames_dict[num_to_id_dict[int(x)]])

    # Here we forward pass each MFCC one by one, which is very slow. We need to
    # figure out a way to group MFCCs of similar durations and extract their
    # embeddings together, to improve speed and utilize the GPU efficiently.
    model.eval()
    extracted_plda_embeddings = {}
    with torch.no_grad():
        for utt in utts:
            if utt in extracted_plda_embeddings:
                continue
            mfcc = kaldi_io.read_mat(mega_mfcc_dict[utt]).T
            mfcc_t = torch.from_numpy(mfcc[np.newaxis, :, :]).to(device)
            data_extracted_plda_embeddings = model.extract_plda_embeddings(
                mfcc_t)
            extracted_plda_embeddings[utt] = np.asarray(
                data_extracted_plda_embeddings[0].cpu())
    return extracted_plda_embeddings
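The comment above calls for batching by duration. Below is a hedged sketch of that grouping, assuming the mega_utt2num_frames_dict referenced in the commented-out lines is available (numpy/torch imports as in the original file); it is an illustration, not the project's implementation.

from collections import defaultdict

# Bucket utterances by frame count so equal-length MFCCs can be stacked
# and forwarded through the model in one batch.
buckets = defaultdict(list)
for utt in utts:
    buckets[mega_utt2num_frames_dict[utt]].append(utt)
with torch.no_grad():
    for n_frames, group in buckets.items():
        mfccs = np.stack([kaldi_io.read_mat(mega_mfcc_dict[u]).T for u in group])
        embs = model.extract_plda_embeddings(torch.from_numpy(mfccs).to(device))
        for u, e in zip(group, embs):
            extracted_plda_embeddings[u] = np.asarray(e.cpu())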
Code Example #17
File: data_4.py Project: jhvmhg/myLAS
def load_inputs_and_targets(batch):
    # From: espnet/src/asr/asr_utils.py: load_inputs_and_targets
    # load acoustic features and target sequence of token ids
    # for b in batch:
    #     print(b[1]['input'][0]['feat'])
    xs = []
    xs_ = [kaldi_io.read_mat(b[1]['input']['feat']) for b in batch]
    ys = [b[1]['output']['tokenid'] for b in batch]

    for x in xs_:
        xs.append(x[:x.shape[0] // 4 * 4])
    # get index of non-zero length samples
    nonzero_idx = filter(lambda i: len(ys[i]) > 0, range(len(xs)))
    # sort in input lengths
    nonzero_sorted_idx = sorted(nonzero_idx, key=lambda i: -len(xs[i]))
    if len(nonzero_sorted_idx) != len(xs):
        print("warning: Target sequences include empty tokenid")

    # remove zero-length samples
    xs = [xs[i] for i in nonzero_sorted_idx]
    ys = [
        np.fromiter(map(int, ys[i]), dtype=np.int64)
        for i in nonzero_sorted_idx
    ]

    return xs, ys
Code Example #18
    def get_item_test(self, idx):
        utt = self.utt_list[idx]
        fpath = self.utt_fpath_dict[utt]
        feats = read_mat(fpath)
        feats = torch.FloatTensor(feats)

        label_dict = {}
        speaker = self.utt_spkr_dict[utt]

        if 'speaker' in self.label_types:
            label_dict['speaker'] = torch.LongTensor([speaker])
        if 'gender' in self.label_types:
            label_dict['gender'] = torch.LongTensor(
                [self.spk_gen_dict[speaker]])
        if 'nationality' in self.label_types:
            label_dict['nationality'] = torch.LongTensor(
                [self.spk_nat_dict[speaker]])
        if 'age' in self.label_types:
            label_dict['age'] = torch.LongTensor(
                [self.utt_age_class_dict[utt]])
        if 'age_regression' in self.label_types:
            label_dict['age_regression'] = torch.FloatTensor(
                [self.utt_age_dict[utt]])

        return feats, label_dict
Code Example #19
    def __getitem__(self, idx):
        utt_infos = self.data_list[idx]
        utt_id = utt_infos[0]
        utt_feats = kaldi_io.read_mat(utt_infos[1]['input'][0]['feat'])

        # add delta feats
        if self.delta_feats_num > 0:
            utt_feats = [utt_feats]
            for i in range(self.delta_feats_num):
                delta_feats = self.delta(utt_feats[i], N=2)
                utt_feats.append(delta_feats)
            utt_feats = np.concatenate(utt_feats, axis=1)
            
        if self.normalized:
            # FLAG: normalization should be done on the train dataset with Kaldi
            utt_feats = (utt_feats - np.mean(utt_feats, axis=0)) / np.std(utt_feats, axis=0)

        # add gaussian noise every time __getitem__ is called.
        # Note that in Graves et al., "Speech Recognition with Deep Recurrent
        # Neural Networks", weight noise was added once per training sequence,
        # rather than at every timestep.
        if self.add_noise and self.dataset_type == 'train':
            utt_feats = np.add(utt_feats, np.random.normal(0, 0.6, utt_feats.shape))

        # map() is lazy in Python 3; materialize the ids so they can be reused
        transcript_ids = list(map(int, utt_infos[1]['output'][0]['token_id'].strip().split()))
        return (utt_id, utt_feats, transcript_ids)
Code Example #20
File: s2sdata.py Project: zhiheng-huang/gluon-nlp
    def __getitem__(self, idx):
        key, path, _ = self._index[idx]
        feat = kaldi_io.read_mat(path)
        shape = feat.shape
        window_size = 1 + self._left_context + self._right_context
        out = np.zeros((int(
            math.ceil(
                float(shape[0] - (
                    (self._left_context +
                     self._right_context) if not self._context_pad else 0)) /
                self._sub_sample)), window_size * shape[1]))
        if self._left_context > 0 or self._right_context > 0:
            feat = np.pad(feat,
                          ((self._right_context if self._context_pad else 0,
                            self._right_context if self._context_pad else
                            (shape[0] - window_size) % self._sub_sample),
                           (0, 0)), 'edge')
        out[:, self._left_context * shape[1]:(self._left_context + 1) * shape[1]] = \
            feat[self._left_context:feat.shape[0] - self._right_context:self._sub_sample]
        for i in range(self._left_context):
            # left context
            out[:, shape[1] * i:shape[1] * (i + 1)] = \
                feat[i:feat.shape[0] - self._left_context - self._right_context + i:self._sub_sample, :]

        for i in range(self._right_context):
            # right context
            out[:, shape[1] * (i + self._left_context + 1):shape[1] * (i + self._left_context + 2)] = \
                feat[self._left_context + 1 + i:feat.shape[0] - self._right_context + 1 + i:self._sub_sample, :]
        if len(self._labels) > 0:
            return key, out, self._labels[key]
        else:
            return key, out
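The windowing arithmetic above is dense. Below is a minimal, self-contained sketch of the same splicing idea (stack each kept frame with its left/right context, repeating border frames), without the original's context_pad bookkeeping; it is an illustration, not the original implementation.

import numpy as np

def splice(feat, left=3, right=3, sub_sample=1):
    # Repeat border frames so every frame has a full context window.
    padded = np.pad(feat, ((left, right), (0, 0)), 'edge')
    T, D = feat.shape
    rows = [padded[i:i + left + right + 1].reshape(-1)
            for i in range(0, T, sub_sample)]
    return np.stack(rows)  # (ceil(T / sub_sample), (left + right + 1) * D)

feat = np.random.randn(10, 4).astype(np.float32)
print(splice(feat, left=2, right=2, sub_sample=2).shape)  # (5, 20)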
Code Example #21
File: feat_text_dataset.py Project: tolysz/espresso
    def __getitem__(self, i):
        self.check_index(i)
        feat = kaldi_io.read_mat(self.rxfiles[i])
        if self.specaugment_config is not None and self.specaugment_config != "":
            with data_utils.numpy_seed(self.seed, self.epoch, i):
                feat = specaug(feat, **eval(self.specaugment_config))
        item = torch.from_numpy(feat).float()
        return item
Code Example #22
    def read_data(self):
        self.data_offsets = np.append([0], np.cumsum(self.sizes)[:-1])
        self.buffer = np.empty((sum(self.sizes), self.feat_dim), dtype=self.dtype)
        for i in range(len(self.data_offsets)):
            ptx = self.data_offsets[i]
            dst = self.buffer[ptx:ptx + self.sizes[i]]
            np.copyto(dst, kaldi_io.read_mat(self.extended_filenames[i]))
Code Example #23
    def __getitem__(self, i):
        self.check_index(i)
        if not self.prefetch_called:  # no caching
            feat = kaldi_io.read_mat(self.rxfiles[i])
            return torch.from_numpy(feat).float()
        if i not in self.cache_index:
            assert (
                self.start_pos_for_next_cache < len(self.ordered_indices)
            ), "Position for next cache starting beyond the end of ordered_indices."
            try:
                pos_start = self.ordered_indices.index(
                    i,
                    self.start_pos_for_next_cache,
                )
            except ValueError:
                raise ValueError(
                    "index {} not found in self.ordered_indices. Set "
                    "self.ordered_prefetch to False, and/or call self.prefetch() "
                    "with the full list of indices, and then try again.".
                    format(i))
            pos_end = min(
                pos_start + self.cache_size,
                len(self.ordered_indices),
            )
            self.start_pos_for_next_cache = pos_end if self.ordered_prefetch else 0
            total_size = 0
            for idx in self.ordered_indices[pos_start:pos_end]:
                total_size += self.sizes[idx]
            self.cache = np.empty((total_size, self.feat_dim),
                                  dtype=self.dtype)
            ptx = 0
            self.cache_index.clear()
            for idx in self.ordered_indices[pos_start:pos_end]:
                self.cache_index[idx] = ptx
                length = self.sizes[idx]
                dst = self.cache[ptx:ptx + length]
                feat = kaldi_io.read_mat(self.rxfiles[idx])
                if self.specaugment_config is not None and self.specaugment_config != "":
                    with data_utils.numpy_seed(self.seed, self.epoch, idx):
                        feat = specaug(feat, **eval(self.specaugment_config))
                np.copyto(dst, feat)
                ptx += length

        ptx = self.cache_index[i]
        a = self.cache[ptx:ptx + self.sizes[i]].copy()
        return torch.from_numpy(a).float()
Code Example #24
    def __init__(
        self,
        utt_ids: List[str],
        rxfiles: List[str],
        utt2num_frames: Optional[List[int]] = None,
        feat_dim: Optional[int] = None,  # only relevant when reading from raw waveforms
        feature_type: Optional[str] = None,  # currently supports fbank or mfcc; only relevant when reading from raw waveforms
        seed=1,
        feature_transforms_config: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        assert len(utt_ids) == len(rxfiles)
        self.dtype = np.float64  # the np.float alias was removed in NumPy 1.24
        self.utt_ids = utt_ids
        self.rxfiles = rxfiles
        self.size = len(utt_ids)  # number of utterances
        self.sizes = []  # length of each utterance in terms of the number of frames
        if utt2num_frames is not None and len(utt2num_frames) > 0:
            assert len(utt2num_frames) == self.size
            self.sizes = utt2num_frames

        first_rxfile = rxfiles[0]
        if re.search(r"\.ark:\d+$", first_rxfile.strip()) is not None:  # from feats.scp
            self.input_format = "feat"
            self.feat_dim = kaldi_io.read_mat(first_rxfile).shape[1]  # feature dimension
        else:
            self.input_format = (
                "command" if re.search(r"\|$", first_rxfile.strip()) is not None
                else "wave"
            )
            self.feat_dim = feat_dim
            self.feature_type = feature_type
            assert self.feat_dim is not None
            assert self.feature_type in ["fbank", "mfcc"]

        if len(self.sizes) == 0:
            logger.info("Computing number of frames from audios...")
            with ThreadPoolExecutor(max_workers=32) as ex:
                futures = []
                for rxfile in self.rxfiles:
                    futures.append(
                        ex.submit(compute_num_frames_from_feat_or_waveform,
                                  rxfile))

                for future in tqdm(futures, desc="Processing", leave=False):
                    result = future.result()
                    self.sizes.append(result)

        assert len(self.sizes) == self.size
        self.sizes = np.array(self.sizes, dtype=np.int32)
        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
            config=feature_transforms_config)
        self.seed = seed
        self.epoch = 1
Code Example #25
    def _load_cmvn(self, cmvn_file):
        # Kaldi global CMVN stats: row 0 holds per-dimension sums with the
        # frame count in the last column; row 1 holds per-dimension sums of squares.
        cmvn = kaldi_io.read_mat(cmvn_file)
        assert cmvn.shape[0] == 2
        cnt = cmvn[0, -1]
        sums = cmvn[0, :-1]
        sums2 = cmvn[1, :-1]
        means = sums / cnt
        stds = np.sqrt(np.maximum(1e-10, sums2 / cnt - means ** 2))
        return means, stds
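A hedged usage sketch: apply the returned statistics to normalize a feature matrix. The cmvn path and rxfile are hypothetical, and `loader` stands for whatever object defines _load_cmvn.

means, stds = loader._load_cmvn('data/train/cmvn.ark')
feat = kaldi_io.read_mat('/data/feats.ark:12345')
feat = (feat - means) / stds  # zero mean, unit variance per dimension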
Code Example #26
def extract_xvectors(model, mega_mfcc_dict, device):
    model.eval()
    extracted_xvectors = {}
    for utt in mega_mfcc_dict:
        mfcc = kaldi_io.read_mat(mega_mfcc_dict[utt]).T
        mfcc_t = torch.from_numpy(mfcc[np.newaxis, :, :]).to(device)
        data_extracted_xvectors = model.xvector_extractor.extract(mfcc_t)
        extracted_xvectors[utt] = data_extracted_xvectors[0].cpu().detach().numpy()
    return extracted_xvectors
Code Example #27
def main():
    filename = sys.argv[1]
    out_path = sys.argv[2]
    lang_emb_start = int(sys.argv[3])
    lang_emb_end = int(sys.argv[4])
    emb = kaldi_io.read_mat(filename)
    emb = np.transpose(emb, (1, 0))[lang_emb_start:lang_emb_end, :]
    print(emb.shape)
    plt.matshow(emb, fignum=None)
    plt.savefig(out_path + '.png')
Code Example #28
File: utils.py Project: cst781/asr_project
def read_all_data(feat_scp):
    with open(feat_scp, 'r') as feat_fid:
        feat = feat_fid.readlines()
    mat_list = []

    for line in feat:
        _, ark = line.split()
        mat = kaldi_io.read_mat(ark)
        mat_list.append(mat)
    return np.concatenate(mat_list, axis=0)
Code Example #29
def read_mat_key(file, target_key):
    """read the matrix of the target key/utterance from a kaldi scp file
    """
    fd = ko.open_or_fd(file)
    try:
        for line in fd:
            (key, rxfile) = line.decode().rstrip().split(' ')  # rstrip the trailing newline
            if key == target_key:
                return ko.read_mat(rxfile)
    finally:
        if fd is not file: fd.close()
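A hedged usage note: each call scans the scp linearly, so repeated lookups cost O(N) apiece; for many keys, load the scp into a dict once instead. Usage sketch (file name and key are hypothetical):

mat = read_mat_key('data/test/feats.scp', 'utt_0001')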
Code Example #30
    def __getitem__(self, index):
        """Generate samples
        """
        rxfile = self.rxfiles[index]
        full_mat = kaldi_io.read_mat(rxfile)
        assert len(full_mat) >= self.seq_len
        pin = np.random.randint(0, len(full_mat) - self.seq_len + 1)
        chunk_mat = full_mat[pin:pin + self.seq_len, :]
        y = np.array(self.labels[index])

        return chunk_mat, y
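A hedged usage sketch: because every item is cropped to the same seq_len, the default DataLoader collation can stack batches without padding (`dataset` is assumed to be an instance of the class above):

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)
for chunk_mat, y in loader:
    pass  # chunk_mat: (64, seq_len, feat_dim), y: (64,)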