Exemple #1
0
def run(args):
    """Write the posterior-to-likelihood transform matrix to stdout.

    The priors come from ``args.priors`` when it is given; otherwise all
    three priors are taken to be equal.  Only the first row of the priors
    matrix is used.  It is normalized to sum to one, and a 2 x 3 matrix is
    written whose entries are weights from ``args`` divided by the matching
    normalized prior:

    * row 0: ``sil_scale``, ``speech_in_sil_weight``,
      ``garbage_in_sil_weight``
    * row 1: ``sil_in_speech_weight``, ``1.0``,
      ``garbage_in_speech_weight``

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not have exactly 3 entries.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)
        # An empty matrix must be rejected here as well; otherwise the
        # priors[0] lookup below fails with an uninformative IndexError.
        if len(priors) == 0 or len(priors[0]) != 3:
            raise RuntimeError("Invalid dimension for priors {0}"
                               "".format(priors))

    first_row = priors[0]
    priors_sum = sum(first_row)
    # Normalize so the three priors sum to one.
    sil_prior = old_div(first_row[0], priors_sum)
    speech_prior = old_div(first_row[1], priors_sum)
    garbage_prior = old_div(first_row[2], priors_sum)

    sil_row = [
        old_div(args.sil_scale, sil_prior),
        old_div(args.speech_in_sil_weight, speech_prior),
        old_div(args.garbage_in_sil_weight, garbage_prior),
    ]
    speech_row = [
        old_div(args.sil_in_speech_weight, sil_prior),
        old_div(1.0, speech_prior),
        old_div(args.garbage_in_speech_weight, garbage_prior),
    ]

    common_lib.write_matrix_ascii(sys.stdout, [sil_row, speech_row])
Exemple #2
0
def run(args):
    """Compute the 2 x 3 posterior-to-likelihood transform and print it.

    Priors are read from ``args.priors`` when provided, and default to equal
    priors otherwise.  The first row of the priors matrix is normalized to
    sum to one.  Each output entry is a weight taken from ``args`` divided by
    the normalized prior of its column (silence, speech, garbage).

    Raises:
        RuntimeError: if the priors matrix has no rows or its first row does
            not contain exactly 3 values.
    """
    if args.priors is None:
        priors = [[1.0, 1.0, 1.0]]
    else:
        priors = common_lib.read_matrix_ascii(args.priors)
        # Reject an empty matrix explicitly: the original condition let it
        # through and the code then crashed on priors[0].
        if len(priors) == 0 or len(priors[0]) != 3:
            raise RuntimeError("Invalid dimension for priors {0}"
                               "".format(priors))

    priors_sum = sum(priors[0])
    sil_prior, speech_prior, garbage_prior = [
        prior / priors_sum for prior in priors[0]
    ]

    transform_mat = [
        # Weights applied when producing the first output row.
        [
            args.sil_scale / sil_prior,
            args.speech_in_sil_weight / speech_prior,
            args.garbage_in_sil_weight / garbage_prior,
        ],
        # Weights applied when producing the second output row.
        [
            args.sil_in_speech_weight / sil_prior,
            1.0 / speech_prior,
            args.garbage_in_speech_weight / garbage_prior,
        ],
    ]

    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
Exemple #3
0
def run(args):
    """Merge pasted target matrices into a single matrix per utterance.

    Every matrix read from ``args.pasted_targets`` is treated as several
    blocks of ``args.dim`` columns placed side by side.  The blocks are
    summed, each scaled by the matching entry of ``args.weights`` (or by 1.0
    when no weights are given), and the result is written to
    ``args.out_targets``.  With ``args.remove_mismatch_frames`` set, rows for
    which ``should_remove_frame`` returns true are written as all zeros.

    Raises:
        RuntimeError: if a matrix's column count is not a multiple of
            ``args.dim``, or if no matrix was processed at all.
    """

    def source_weight(source_index):
        # Unit weight unless explicit per-source weights were supplied.
        if args.weights is None:
            return 1.0
        return args.weights[source_index]

    merged_count = 0

    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, "w") as targets_writer:
        for key, mat in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(mat)
            num_rows, num_cols = mat.shape
            if num_cols % args.dim != 0:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(
                        utt=key,
                        f=args.pasted_targets.name,
                        nc=mat.shape[1],
                        dim=args.dim,
                    ))
            num_sources = num_cols // args.dim

            out_mat = np.matrix(np.zeros([num_rows, args.dim]))

            if args.remove_mismatch_frames:
                for n in range(num_rows):
                    if should_remove_frame(mat[n, :].getA()[0], args.dim):
                        out_mat[n, :] = np.zeros([1, args.dim])
                        continue
                    for i in range(num_sources):
                        first_col = i * args.dim
                        last_col = (i + 1) * args.dim
                        out_mat[n, :] += (mat[n, first_col:last_col]
                                          * source_weight(i))
            else:
                # Just interpolate the targets
                for i in range(num_sources):
                    first_col = i * args.dim
                    last_col = (i + 1) * args.dim
                    out_mat += mat[:, first_col:last_col] * source_weight(i)

            common_lib.write_matrix_ascii(
                targets_writer, out_mat.tolist(), key=key)
            merged_count += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=merged_count))

    if merged_count == 0:
        raise RuntimeError
Exemple #4
0
def run(args):
    """Write a 3 x 3 diagonal posterior-to-likelihood transform to stdout.

    The priors read from ``args.priors`` are divided by the combined mass of
    the first two entries, and the diagonal of the output holds the
    reciprocals of those normalized priors.  The third diagonal entry is
    forced to zero, and the second one is additionally multiplied by
    ``args.speech_likelihood_weight``.

    Raises:
        RuntimeError: if the priors matrix is not exactly one row of three
            values.
    """
    # Load priors.
    # - priors[0]  --  prior probability of non-speech
    # - priors[1]  --  prior probability of speech
    # - priors[2]  --  prior probability of garbage; ignored
    priors = common_lib.read_matrix_ascii(args.priors)
    # Exactly one row is required: an empty matrix used to slip through and
    # fail on priors[0], and several rows survive np.squeeze as a 2-D array
    # that later breaks the transform_mat[2, 2] assignment.
    if len(priors) != 1 or len(priors[0]) != 3:
        raise RuntimeError(f'Invalid dimension for priors {priors}')
    priors = np.squeeze(np.array(priors, dtype=np.float64))

    # Create matrix that converts posteriors to likelihoods by dividing by
    # normalized priors.
    pmass = priors[0] + priors[1]  # Total mass devoted to speech/non-speech.
    priors /= pmass
    transform_mat = np.diag(1 / priors)
    transform_mat[2, 2] = 0.0  # Ignore garbage entirely
    transform_mat[1, 1] *= args.speech_likelihood_weight
    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
Exemple #5
0
def run(args):
    """Combine side-by-side target blocks into one matrix per utterance.

    Each matrix from ``args.pasted_targets`` is split into consecutive
    blocks of ``args.dim`` columns.  The output is the sum of those blocks,
    each multiplied by its entry in ``args.weights`` (1.0 when weights are
    absent).  When ``args.remove_mismatch_frames`` is set, any row flagged by
    ``should_remove_frame`` is emitted as zeros instead.  Results go to
    ``args.out_targets``.

    Raises:
        RuntimeError: when the number of columns is not a multiple of
            ``args.dim``, or when nothing was merged.
    """
    matrices_written = 0

    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, 'w') as targets_writer:
        for key, mat in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(mat)
            dim = args.dim
            if mat.shape[1] % dim:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=args.pasted_targets.name,
                              nc=mat.shape[1], dim=args.dim))

            # Column range occupied by each pasted source.
            num_sources = mat.shape[1] // dim
            bounds = [(s * dim, (s + 1) * dim) for s in range(num_sources)]

            out_mat = np.matrix(np.zeros([mat.shape[0], dim]))

            if not args.remove_mismatch_frames:
                # Just interpolate the targets
                for s, (lo, hi) in enumerate(bounds):
                    block = mat[:, lo:hi]
                    scale = 1.0 if args.weights is None else args.weights[s]
                    out_mat += block * scale
            else:
                for n in range(mat.shape[0]):
                    if should_remove_frame(mat[n, :].getA()[0], dim):
                        out_mat[n, :] = np.zeros([1, dim])
                        continue
                    for s, (lo, hi) in enumerate(bounds):
                        block = mat[n, lo:hi]
                        scale = (1.0 if args.weights is None
                                 else args.weights[s])
                        out_mat[n, :] += block * scale

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            matrices_written += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=matrices_written))

    if matrices_written == 0:
        raise RuntimeError
def run(args):
    """Print the 2 x 3 transform from class posteriors to likelihoods.

    The first row of the priors matrix (read from ``args.priors``, or equal
    priors when that is ``None``) is normalized to sum to one.  Every output
    entry is one of the weights configured in ``args`` divided by the
    normalized prior of its column.

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not hold exactly 3 values.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)
        # "== 0 or": an empty matrix is just as invalid as a row of the
        # wrong width, and would otherwise crash on priors[0] below.
        if len(priors) == 0 or len(priors[0]) != 3:
            raise RuntimeError("Invalid dimension for priors {0}"
                               "".format(priors))

    priors_sum = sum(priors[0])
    normalized = [prior / priors_sum for prior in priors[0]]

    # Per-column weights for the two output rows, in the same column order
    # as the priors.
    row_weights = [
        (args.sil_scale,
         args.speech_in_sil_weight,
         args.garbage_in_sil_weight),
        (args.sil_in_speech_weight,
         1.0,
         args.garbage_in_speech_weight),
    ]
    transform_mat = [
        [weight / prior for weight, prior in zip(weights, normalized)]
        for weights in row_weights
    ]

    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
Exemple #7
0
def run(args):
    """Sub-sample every target matrix by averaging windows of frames.

    For each matrix read from ``args.targets_in_ark``, output row ``i`` is
    the mean of the input rows in a window of ``args.subsampling_factor``
    frames around a center frame; centers start at half the factor and step
    by the factor.  Windows are clipped to the matrix bounds.  Results are
    written to ``args.targets_out_ark``, and both archives are closed at the
    end.
    """
    processed = 0
    for key, mat in common_lib.read_mat_ark(args.targets_in_ark):
        mat = np.matrix(mat)
        total_frames = mat.shape[0]
        if args.subsampling_factor > 0:
            num_indexes = old_div(
                total_frames + args.subsampling_factor - 1,
                args.subsampling_factor)

        out_mat = np.zeros([num_indexes, mat.shape[1]])

        first_center = int(args.subsampling_factor / 2.0)
        half_window = float(args.subsampling_factor) / 2.0
        centers = range(first_center, total_frames, args.subsampling_factor)
        for i, k in enumerate(centers):
            # Clip the averaging window to the valid frame range.
            st = max(int(k - half_window), 0)
            end = min(int(k + half_window), total_frames)

            try:
                window_sum = np.sum(mat[st:end, :], axis=0)
                out_mat[i, :] = window_sum / float(end - st)
            except IndexError:
                logger.error("mat.shape = {0}, st = {1}, end = {2}"
                             "".format(mat.shape, st, end))
                raise
            assert i == old_div(k, args.subsampling_factor)

        common_lib.write_matrix_ascii(args.targets_out_ark, out_mat, key=key)
        processed += 1

    args.targets_in_ark.close()
    args.targets_out_ark.close()

    logger.info("Sub-sampled {num_utts} target matrices"
                "".format(num_utts=processed))
Exemple #8
0
def run(args):
    """Sub-sample every target matrix by averaging windows of frames.

    For each matrix read from ``args.targets_in_ark``, output row ``i`` is
    the mean of the input rows in a window of ``args.subsampling_factor``
    frames around a center frame; centers start at half the factor and step
    by the factor.  Windows are clipped to the matrix bounds.  Results are
    written to ``args.targets_out_ark``.  Both archives are closed when the
    function exits, including on error.

    Raises:
        ValueError: if ``args.subsampling_factor`` is not positive.
    """
    factor = args.subsampling_factor
    num_utts = 0
    try:
        # Previously a non-positive factor left num_indexes undefined and
        # surfaced as a NameError on the first matrix.
        if factor <= 0:
            raise ValueError("subsampling_factor must be positive; "
                             "got {0}".format(factor))

        half_window = factor / 2.0
        for key, mat in common_lib.read_mat_ark(args.targets_in_ark):
            mat = np.matrix(mat)
            num_frames = mat.shape[0]
            # Ceiling division.  It must be integer division: under
            # Python 3 "/" yields a float, which np.zeros rejects as a
            # shape.
            num_indexes = (num_frames + factor - 1) // factor

            out_mat = np.zeros([num_indexes, mat.shape[1]])
            centers = range(int(half_window), num_frames, factor)
            for i, k in enumerate(centers):
                # Clip the averaging window to the valid frame range.
                st = max(int(k - half_window), 0)
                end = min(int(k + half_window), num_frames)

                try:
                    out_mat[i, :] = (np.sum(mat[st:end, :], axis=0)
                                     / float(end - st))
                except IndexError:
                    logger.error("mat.shape = {0}, st = {1}, end = {2}"
                                 "".format(mat.shape, st, end))
                    raise
                # Integer division here too; with "/" this comparison is
                # false for almost every k under Python 3.
                assert i == k // factor

            common_lib.write_matrix_ascii(args.targets_out_ark, out_mat,
                                          key=key)
            num_utts += 1
    finally:
        args.targets_in_ark.close()
        args.targets_out_ark.close()

    logger.info("Sub-sampled {num_utts} target matrices"
                "".format(num_utts=num_utts))
Exemple #9
0
def run(args):
    """Convert arc information into per-frame target matrices.

    Each line of ``args.arc_info`` is split into
    ``utt start_frame num_frames post phone``.  For every frame the arc
    covers, ``post`` is added to one of three columns of that utterance's
    target matrix:

    * column 0 when the phone is listed in ``args.silence_phones``;
    * column 2 when the arc is longer than ``args.max_phone_length`` frames
      or the phone is listed in ``args.garbage_phones``;
    * column 1 otherwise.

    Lines for the same utterance are expected to be contiguous: a matrix is
    written to ``args.targets_file`` each time the utterance id changes, and
    once more after the last line.

    Raises:
        RuntimeError: if either phone list is empty, if a phone appears in
            both lists, or if the final success/error counts fail the check
            at the end of the function.
    """
    silence_phones = set()
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones.add(line.strip().split()[0])

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = set()
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones.add(word)

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    def write_utterance(writer, utt_id, utt_targets):
        """Write one utterance's targets; return False if it has none."""
        if len(utt_targets) == 0:
            return False
        common_lib.write_matrix_ascii(writer, utt_targets, key=utt_id)
        return True

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]

                if utt != prev_utt:
                    # A new utterance starts: flush the previous one.
                    if prev_utt != "":
                        if write_utterance(targets_writer, prev_utt,
                                           targets):
                            num_utts += 1
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                # Grow the matrix so it covers every frame of this arc.
                end_frame = start_frame + num_frames
                if end_frame > len(targets):
                    targets.extend(
                        [0, 0, 0] for _ in range(len(targets), end_frame))

                for t in range(start_frame, end_frame):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # Flush the last utterance while the writer is still open.  This
        # used to run after the "with" block and passed args.targets_file
        # instead of the open writer, so the final matrix did not go through
        # the same handle as all the others.
        if prev_utt != "":
            if write_utterance(targets_writer, prev_utt, targets):
                num_utts += 1
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    if num_utts == 0 or num_err >= num_utts / 2:
        raise RuntimeError(
            "Too many failures: wrote {num_utts} targets; failed with "
            "{num_err}".format(num_utts=num_utts, num_err=num_err))
def run(args):
    """Build per-recording speech/silence target matrices from an RTTM file.

    ``args.reco2num_frames`` gives the number of rows for each recording.
    Every recording gets a ``num_frames x 2`` matrix whose rows default to
    the silence vector ``[1 - s, s / 2]`` (``s`` is
    ``args.label_smoothing``).  Frames covered by a segment of that
    recording in ``args.rttm`` are set to the speech vector
    ``[s / 2, 1 - s]``.  Segment times are converted to frames by dividing
    by ``args.frame_shift``.  Matrices are written to
    ``args.out_targets_ark`` in sorted recording-id order.

    Raises:
        ValueError: if a line of ``args.reco2num_frames`` does not have
            exactly two fields.
    """
    # Get all reco to num_frames, which will be used to decide the number of
    # rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[fields[0]] = int(fields[1])

    # Read all segments, grouped by the recording they belong to.
    reco2segs = defaultdict(list)
    with common_lib.smart_open(args.rttm) as f:
        for line in f:
            segment_fields = line.strip().split()
            start = float(segment_fields[3])
            duration = float(segment_fields[4])
            reco2segs[segment_fields[1]].append(
                Segment(reco=segment_fields[1],
                        spk=segment_fields[7],
                        start=start,
                        dur=duration,
                        end=start + duration))

    # The two possible row values do not depend on the recording.
    # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val], dtype=np.float64)
    speech_vec = np.array([other_val, target_val], dtype=np.float64)

    # Now, for each reco, create a matrix of shape num_frames x 2 and fill
    # in using the segments information for that reco
    reco2targets = {}
    for reco_id, total_frames in reco2num_frames.items():
        segs = sorted(reco2segs.get(reco_id, []), key=lambda x: x.start)

        # The default target (if not speech) is silence
        targets_mat = np.tile(silence_vec, (total_frames, 1))
        # Now iterate over all segments of the recording and assign targets
        for seg in segs:
            start_frame = int(seg.start / args.frame_shift)
            end_frame = min(int(seg.end / args.frame_shift), total_frames)
            num_frames = end_frame - start_frame
            if num_frames <= 0:
                # Empty segment, or one that starts past the recording end.
                continue

            targets_mat[start_frame:end_frame] = np.tile(
                speech_vec, (num_frames, 1))

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets):
            common_lib.write_matrix_ascii(f,
                                          reco2targets[reco_id].tolist(),
                                          key=reco_id)
Exemple #11
0
def run(args):
    """Merge per-utterance target matrices into per-recording matrices.

    For each recording in ``args.reco2utt`` a matrix with
    ``reco2num_frames[reco]`` rows is initialized from the default targets
    (``args.default_targets`` if given, zeros otherwise).  Each utterance's
    target matrix is loaded by running ``copy-feats`` on its scp entry and
    copied into the rows spanned by its segment.  Where an utterance
    overlaps the previously placed one, the overlapping rows are linearly
    cross-faded from the existing values to the new ones.  The merged
    matrices are written to ``args.out_targets_ark``.

    Utterances without a segment or scp entry, with a length mismatch above
    ``args.length_tolerance``, starting outside the recording, or lying
    entirely inside an earlier segment are counted as errors and skipped.

    Raises:
        RuntimeError: if ``copy-feats`` exits with a non-zero status, or if
            the final utterance/recording counts fail the check at the end.
        ValueError: if a segment spans no frames.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert np.shape(default_targets)[0] == 1 and np.shape(
        default_targets)[1] == 3

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, "w") as fh:
        for reco, utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets,
                                 reco2num_frames[reco],
                                 axis=0)
            # Sort on start time.  Utterances with no segment sort last
            # instead of raising KeyError here, so the loop below can count
            # them as errors as it was written to do.
            utts.sort(key=lambda x: (segments[x][1]
                                     if x in segments else float("inf")))

            end_frame_accounted = 0

            for utt in utts:
                if utt not in segments or utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments
                # NOTE(review): the scp entry is interpolated into a shell
                # command unquoted; presumably the scp file is trusted --
                # confirm.
                cmd = "copy-feats --binary=false {mat_fn} -" "".format(
                    mat_fn=targets[utt])
                p = subprocess.Popen(cmd,
                                     shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype="float32")
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    # Always reap the child and surface a failing exit code.
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            "stderr = {stderr}".format(cmd=cmd,
                                                       status=-p.returncode,
                                                       stderr=stderr))

                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt,
                                       s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Fix end_frame and num_frames if the segment goes beyond
                # the length of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                # Fix "num_frames" and "end_frame" if "num_frames" is lower
                # than the size of the targets matrix "mat"
                num_frames = min(num_frames, mat.shape[0])
                end_frame = start_frame + num_frames

                if num_frames <= 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                if end_frame < end_frame_accounted:
                    logger.warning("For utterance {utt}, end-frame {end} "
                                   "is before the end of a previous segment. "
                                   "i.e. this segment is completely within "
                                   "another segment. Ignoring this segment."
                                   "".format(utt=utt, end=end_frame))
                    num_utt_err += 1
                    continue

                if start_frame < end_frame_accounted:
                    # Segment overlaps with a previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end of
                    # overlap and 0 at the end/start of the segment
                    for n in range(0, end_frame_accounted - start_frame):
                        w = float(n) / float(end_frame_accounted - start_frame)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w) +
                            mat[n, :] * w)

                    # Rows past the overlap are copied unchanged.
                    if end_frame > end_frame_accounted:
                        reco_mat[end_frame_accounted:end_frame, :] = mat[(
                            end_frame_accounted -
                            start_frame):(end_frame - start_frame), :, ]
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    reco_mat[start_frame:end_frame, :] = mat[0:num_frames, :]
                logger.debug(
                    "reco_mat shape = %s, mat shape = %s, "
                    "start_frame = %d, end_frame = %d",
                    reco_mat.shape,
                    mat.shape,
                    start_frame,
                    end_frame,
                )

                end_frame_accounted = end_frame
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat, key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt,
                          num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError
Exemple #12
0
def run(args):
    """Turn arc information lines into three-column per-frame targets.

    A line of ``args.arc_info`` is read as
    ``utt start_frame num_frames post phone``.  ``post`` is accumulated, for
    each frame of the arc, into column 0 (phone in ``args.silence_phones``),
    column 2 (arc longer than ``args.max_phone_length`` frames, or phone in
    ``args.garbage_phones``) or column 1 (anything else).  One matrix per
    utterance is written to ``args.targets_file``; lines of one utterance
    are expected to be adjacent, because a change of utterance id triggers
    the write.

    Raises:
        RuntimeError: if a phone list is empty, if a phone is in both lists,
            or if the final success/error counts fail the check at the end
            of the function.
    """
    silence_phones = set()
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones.add(line.strip().split()[0])

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = set()
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                    word=word,
                    silence=args.silence_phones,
                    garbage=args.garbage_phones))
            garbage_phones.add(word)

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]

                if utt != prev_utt:
                    # Utterance boundary: emit what was accumulated so far.
                    if prev_utt != "":
                        if len(targets) > 0:
                            num_utts += 1
                            common_lib.write_matrix_ascii(
                                targets_writer, targets, key=prev_utt)
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                # Pad with zero rows up to the last frame of this arc.
                missing_rows = start_frame + num_frames - len(targets)
                if missing_rows > 0:
                    targets.extend([0, 0, 0] for _ in range(missing_rows))

                for t in range(start_frame, start_frame + num_frames):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # The last utterance has no following id change to trigger its
        # write, so emit it here.  It must happen inside the "with" block
        # and go through targets_writer: the old code ran after the writer
        # was closed and passed args.targets_file instead.
        if prev_utt != "":
            if len(targets) > 0:
                num_utts += 1
                common_lib.write_matrix_ascii(targets_writer, targets,
                                              key=prev_utt)
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    # NOTE(review): with floor division a run with num_utts == 1 and
    # num_err == 0 also trips this check (0 >= 0) -- confirm that is
    # intended.
    if num_utts == 0 or num_err >= num_utts // 2:
        raise RuntimeError(
            "Too many failures: wrote {num_utts} targets; failed with "
            "{num_err}".format(num_utts=num_utts, num_err=num_err))
            num_frames = end_frame - start_frame
            if (num_frames <= 0):
                continue

            targets_mat[start_frame:end_frame] = np.tile(speech_vec, (num_frames,1))
            num_targets[1] += end_frame - start_frame

<<<<<<< HEAD
        num_targets[0] = reco2num_frames[reco_id] - sum(num_targets)
        # print ("{}: {}".format(reco_id, num_targets))
=======

        num_targets[0] = reco2num_frames[reco_id] - sum(num_targets)
>>>>>>> upstream/master
        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f, reco2targets[reco_id].tolist(), key=reco_id)

def main():
    """Parse the command-line arguments and run the script.

    Any exception raised by ``run`` propagates to the caller unchanged.
    """
    run(get_args())

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

def run(args):
    """Write default targets for the out-of-segment regions of recordings.

    For every recording in ``args.reco2utt`` a matrix with one row per frame
    (from ``args.reco2num_frames``) is filled with the default target row
    (``args.default_targets`` if given, ``[1, 0, 0]`` otherwise).  The rows
    covered by each of the recording's segments (from ``args.segments``,
    converted to frames with ``args.frame_shift``) are then set to zero.
    The matrices are written to ``args.out_targets_ark``.

    Raises:
        ValueError: if a line of one of the input files has an unexpected
            number of fields.
        RuntimeError: if the final utterance/recording counts fail the check
            at the end of the function.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    # utt -> [reco, start_time, end_time]
    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # items() works on both Python 2 and 3; iteritems() is Python 2 only.
        for reco, utts in reco2utt.items():
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)
            # Sort on start time.  Utterances without a segment sort last
            # rather than raising KeyError, so the loop below can count them
            # as errors.
            utts.sort(key=lambda x: (segments[x][1]
                                     if x in segments else float("inf")))
            for utt in utts:
                if utt not in segments:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                start_frame = int(segment[1] / args.frame_shift)
                end_frame = int(segment[2] / args.frame_shift)
                num_frames = end_frame - start_frame

                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    # A negative row count would make np.zeros raise and
                    # abort the whole run.
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is after end-frame {end}; skipping"
                                   "".format(utt=utt, start=start_frame,
                                             end=end_frame))
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = np.zeros([num_frames, 3])
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} recordings "
                "containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {num_utt} in-segment regions in {num_reco} "
            "recordings; failed to account {num_utt_err} utterances"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))
def WriteDistMatrices(D, wark):
    """Write every matrix in ``D`` to ``wark``, in sorted key order.

    Each value is converted with ``tolist()`` and written under its key.
    """
    with common_lib.smart_open(wark, 'w') as ark_writer:
        for key in sorted(D):
            rows = D[key].tolist()
            common_lib.write_matrix_ascii(ark_writer, rows, key=key)
def run(args):
    """Write default targets for the out-of-segment regions of recordings.

    For every recording in ``args.reco2utt`` a matrix with one row per frame
    (row count taken from ``args.reco2num_frames``) is filled with the
    default target row. The rows covered by each of the recording's segments
    (from ``args.segments``) are then set to zero, and the matrix is written
    to ``args.out_targets_ark`` keyed by the recording id.

    Args:
        args: namespace providing
            ``reco2utt``: file of ``<reco> <utt1> [<utt2> ...]`` lines;
            ``reco2num_frames``: file of ``<reco> <num-frames>`` lines;
            ``segments``: file of ``<utt> <reco> <start> <end>`` lines with
            an optional fifth field that is ignored;
            ``default_targets``: location of a 1 x 3 ASCII matrix, or None
            to use ``[1, 0, 0]``;
            ``out_targets_ark``: output location;
            ``frame_shift``: divisor that converts segment start/end times
            to frame indices (presumably seconds per frame; confirm against
            the argument parser).

    Raises:
        ValueError: if a line of an input file has an unexpected number of
            fields.
        KeyError: if a recording in reco2utt has no entry in
            reco2num_frames.
        RuntimeError: if no utterance or recording was processed, or more
            than half as many utterances failed as succeeded.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    # utt -> [reco, start_time, end_time], restricted to known recordings.
    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # items() is available on both Python 2 and 3; iteritems() is not.
        for reco, utts in reco2utt.items():
            total_frames = reco2num_frames[reco]
            reco_mat = np.repeat(default_targets, total_frames, axis=0)

            # Zeroing rows is order-independent, so the utterances are not
            # sorted by start time. This also keeps the missing-segment
            # guard reachable: sorting on segments[utt] first would raise
            # KeyError for exactly the utterances the guard is meant to skip.
            for utt in utts:
                if utt not in segments:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                start_frame = int(segment[1] / args.frame_shift)
                # Clip segments that run past the end of the recording.
                end_frame = min(int(segment[2] / args.frame_shift),
                                total_frames)

                if start_frame > end_frame:
                    # Segment starts beyond the recording (or ends before it
                    # starts); nothing can be zeroed for it.
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is after end-frame {end}"
                                   "".format(utt=utt, start=start_frame,
                                             end=end_frame))
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = 0
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(), key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} recordings "
                "containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco,
                          num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {num_utt} in-segment regions in {num_reco} "
            "recordings; {num_utt_err} utterances failed"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))
def run(args):
    """Build per-frame silence / single / overlap targets from an RTTM file.

    For every recording in ``args.reco2num_frames`` a ``num_frames x 3``
    matrix is created whose rows default to the silence target. Rows covered
    by a segment labelled ``"overlap"`` receive the overlap target; rows
    covered by any other segment receive the single-speaker target. Segments
    are applied in order of start time, so a later segment overwrites an
    earlier one where they intersect. The matrices are written to
    ``args.out_targets_ark`` in sorted recording-id order.

    The target columns are ordered [silence, single, overlap]; the target
    class gets ``1 - label_smoothing`` and each other class gets
    ``label_smoothing / 2``.

    Args:
        args: namespace providing
            ``reco2num_frames``: file of ``<reco> <num-frames>`` lines;
            ``overlap_rttm``: whitespace-separated file from which field 1
            (recording id), field 3 (start time), field 4 (duration) and
            field 7 (label) are read, counting from 0;
            ``label_smoothing``: amount of probability mass moved off the
            target class;
            ``frame_shift``: divisor that converts times to frame indices;
            ``out_targets_ark``: output location.

    Raises:
        ValueError: if a reco2num_frames line does not have two fields.
    """
    # Number of frames per recording decides the number of matrix rows.
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[parts[0]] = int(parts[1])

    # Group segments by recording while reading. Appending to a defaultdict
    # keeps every segment of a recording even when the RTTM lines of
    # different recordings are interleaved; itertools.groupby only merges
    # consecutive equal keys and would keep just the last run.
    reco2segs = defaultdict(list)
    with common_lib.smart_open(args.overlap_rttm) as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue  # tolerate blank lines
            seg = Segment(parts[1],
                          float(parts[3]),
                          dur=float(parts[4]),
                          label=parts[7])
            reco2segs[seg.reco_id].append(seg)

    # The label vectors do not depend on the recording, so build them once.
    # The builtin float replaces the np.float alias removed from NumPy, and
    # 2.0 avoids integer division on Python 2.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2.0
    silence_vec = np.array([target_val, other_val, other_val], dtype=float)
    single_vec = np.array([other_val, target_val, other_val], dtype=float)
    overlap_vec = np.array([other_val, other_val, target_val], dtype=float)

    reco2targets = {}
    for reco_id, total_frames in reco2num_frames.items():
        # The default target (if not single or overlap) is silence.
        targets_mat = np.tile(silence_vec, (total_frames, 1))

        segs = sorted(reco2segs.get(reco_id, ()), key=lambda x: x.start_time)
        for seg in segs:
            start_frame = int(seg.start_time / args.frame_shift)
            # Clip segments that run past the end of the recording.
            end_frame = min(int(seg.end_time / args.frame_shift),
                            total_frames)
            if end_frame <= start_frame:
                continue
            # The 1-D label vector broadcasts over all selected rows.
            if seg.label == "overlap":
                targets_mat[start_frame:end_frame] = overlap_vec
            else:
                targets_mat[start_frame:end_frame] = single_vec

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets):
            common_lib.write_matrix_ascii(f,
                                          reco2targets[reco_id].tolist(),
                                          key=reco_id)
def run(args):
    """Merge per-utterance target matrices into one matrix per recording.

    For every recording in ``args.reco2utt`` a matrix with one row per frame
    (row count from ``args.reco2num_frames``) is initialised with the
    default target row. Each utterance's target matrix is read by running
    ``copy-feats --binary=false`` on its ``args.targets_scp`` entry and is
    copied into the rows spanned by the utterance's segment. Where an
    utterance starts before the previous utterance (in start-time order)
    ends, the overlapping rows are linearly interpolated between what is
    already in the recording matrix and the new utterance's targets. The
    result is written to ``args.out_targets_ark`` keyed by recording id.

    Args:
        args: namespace providing ``reco2utt``, ``reco2num_frames``,
            ``segments``, ``targets_scp``, ``default_targets`` (location of
            a 1 x 3 ASCII matrix, or None for all zeros),
            ``out_targets_ark``, ``frame_shift`` (divisor converting segment
            times to frame indices) and ``length_tolerance`` (maximum
            allowed difference in frames between a segment's length and its
            targets matrix).

    Raises:
        ValueError: if a segment spans zero or fewer frames.
        RuntimeError: if the copy-feats command exits with non-zero status,
            if no utterance or recording was merged, or if more than half
            as many utterances failed as succeeded.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, 'w') as fh:
        # items() is available on both Python 2 and 3; iteritems() is not.
        for reco, utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)

            # Utterances without a segments entry cannot be placed in time.
            # Count them as failures up front so that sorting on start time
            # (and the previous-utterance lookup below) never hits a
            # KeyError for them.
            known_utts = [utt for utt in utts if utt in segments]
            num_utt_err += len(utts) - len(known_utts)
            known_utts.sort(key=lambda x: segments[x][1])  # sort on start time

            for i, utt in enumerate(known_utts):
                if utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments.
                # NOTE(review): the command is run through the shell with
                # the targets_scp entry embedded verbatim, so the scp file
                # must come from a trusted source.
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype='float32')
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    # Always reap the child and surface a non-zero exit code.
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            'stderr = {stderr}'.format(cmd=cmd, status=-p.returncode,
                                                       stderr=stderr))

                # Round segment boundaries to the nearest frame.
                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Clip segments that run past the end of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                # End frame of the previous utterance in start-time order
                # (0 for the first utterance of the recording).
                prev_utt_end_frame = (
                    int(segments[known_utts[i-1]][2] / args.frame_shift + 0.5)
                    if i > 0 else 0)
                if start_frame < prev_utt_end_frame:
                    # Segment overlaps with the previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end of
                    # overlap and 0 at the end/start of the segment
                    # NOTE(review): this indexes mat and reco_mat for the
                    # whole overlap length; a segment that ends before the
                    # previous one does looks like it would index past
                    # mat's rows. Confirm whether that can occur.
                    for n in range(0, prev_utt_end_frame - start_frame):
                        w = float(n) / float(prev_utt_end_frame - start_frame)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    num_frames = min(num_frames, mat.shape[0])
                    end_frame = start_frame + num_frames
                    reco_mat[prev_utt_end_frame:end_frame, :] = (
                        mat[(prev_utt_end_frame-start_frame):
                            (end_frame-start_frame), :])
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    num_frames = min(num_frames, mat.shape[0])
                    reco_mat[start_frame:(start_frame + num_frames), :] = (
                        mat[0:num_frames, :])
                logger.debug("reco_mat shape = %s, mat shape = %s, "
                             "start_frame = %d, end_frame = %d", reco_mat.shape,
                             mat.shape, start_frame, end_frame)
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat,
                                              key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: merged {num_utt} segment targets from "
            "{num_reco} recordings; failed with {num_utt_err} utterances"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))