def run(args):
    """Convert per-utterance SAD output into a padded segments file.

    Reads the optional ``args.utt2dur`` table (``<utt> <duration>`` per
    line), then for every line of ``args.in_sad`` builds a ``Segmentation``
    from the fields following the utterance id, pads its speech segments by
    ``args.segment_padding`` (bounded by the utterance duration when
    ``args.utt2dur`` was given), writes it to ``args.out_segments`` and adds
    its stats to a global ``SegmenterStats`` that is logged at the end.

    Raises:
        RuntimeError: if a line of either input file cannot be parsed.
    """
    utt2dur = {}
    if args.utt2dur is not None:
        with common_lib.smart_open(args.utt2dur) as utt2dur_fh:
            for line in utt2dur_fh:
                parts = line.strip().split()
                if len(parts) != 2:
                    raise RuntimeError("Unable to parse line '{0}' in {1}"
                                       "".format(line.strip(), args.utt2dur))
                utt2dur[parts[0]] = float(parts[1])

    global_stats = SegmenterStats()
    with common_lib.smart_open(args.in_sad) as in_sad_fh, \
            common_lib.smart_open(args.out_segments, 'w') as out_segments_fh:
        for line in in_sad_fh:
            parts = line.strip().split()
            # Validate before indexing so that a blank line is reported as a
            # parse error instead of an IndexError, and name the input file
            # (not the file handle object) in the message.
            if len(parts) < 2:
                raise RuntimeError("Unable to parse line '{0}' in {1}"
                                   "".format(line.strip(), args.in_sad))
            utt_id = parts[0]

            segmentation = Segmentation()
            segmentation.initialize_segments(parts[1:], args.frame_shift)
            segmentation.pad_speech_segments(
                args.segment_padding,
                None if args.utt2dur is None else utt2dur[utt_id])
            segmentation.write(utt_id, out_segments_fh)
            global_stats.add(segmentation.stats)
    logger.info(global_stats)
def run(args):
    """Turn SAD labels into padded segments, one utterance per input line.

    ``args.utt2dur`` (optional) is read into a ``{utt: duration}`` dict.
    Each line of ``args.in_sad`` is ``<utt-id> <labels...>``; the labels are
    passed to ``Segmentation.initialize_segments`` together with
    ``args.frame_shift``, the speech segments are padded by
    ``args.segment_padding`` and the result is written to
    ``args.out_segments``.  Accumulated ``SegmenterStats`` are logged once
    all lines have been processed.

    Raises:
        RuntimeError: if a line of either input file cannot be parsed.
    """
    utt2dur = {}
    if args.utt2dur is not None:
        with common_lib.smart_open(args.utt2dur) as utt2dur_fh:
            for line in utt2dur_fh:
                parts = line.strip().split()
                if len(parts) != 2:
                    raise RuntimeError("Unable to parse line '{0}' in {1}"
                                       "".format(line.strip(), args.utt2dur))
                utt2dur[parts[0]] = float(parts[1])

    global_stats = SegmenterStats()
    with common_lib.smart_open(args.in_sad) as in_sad_fh, \
            common_lib.smart_open(args.out_segments, 'w') as out_segments_fh:
        for line in in_sad_fh:
            parts = line.strip().split()
            if len(parts) < 2:
                # Checked before parts[0] is read so that an empty line
                # yields this error rather than an IndexError; the message
                # reports the input path instead of the handle's repr.
                raise RuntimeError("Unable to parse line '{0}' in {1}"
                                   "".format(line.strip(), args.in_sad))
            utt_id = parts[0]
            max_duration = None if args.utt2dur is None else utt2dur[utt_id]

            segmentation = Segmentation()
            segmentation.initialize_segments(parts[1:], args.frame_shift)
            segmentation.pad_speech_segments(args.segment_padding,
                                             max_duration)
            segmentation.write(utt_id, out_segments_fh)
            global_stats.add(segmentation.stats)
    logger.info(global_stats)
def run(args):
    """Combine the column-pasted target matrices of each utterance.

    Every matrix read from ``args.pasted_targets`` is treated as
    ``num_cols // args.dim`` blocks of ``args.dim`` columns laid side by
    side.  The blocks are summed, each scaled by ``args.weights[i]`` when
    weights are given (1.0 otherwise), and the sum is written to
    ``args.out_targets`` under the same key.  With
    ``args.remove_mismatch_frames`` set, rows for which
    ``should_remove_frame`` returns true are left as zeros.

    Raises:
        RuntimeError: if a matrix's column count is not a multiple of
            ``args.dim``, or if no matrix was processed.
    """
    dim = args.dim
    weights = args.weights
    merged_count = 0

    def scale_of(source):
        # Weight applied to the source-th block of columns.
        return 1.0 if weights is None else weights[source]

    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, "w") as targets_writer:
        for key, pasted in common_lib.read_mat_ark(targets_reader):
            pasted = np.matrix(pasted)
            num_rows, num_cols = pasted.shape
            if num_cols % dim != 0:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=args.pasted_targets.name,
                              nc=num_cols, dim=dim))

            # (first column, one-past-last column) of every source block.
            bounds = [(i * dim, (i + 1) * dim)
                      for i in range(num_cols // dim)]
            out_mat = np.matrix(np.zeros([num_rows, dim]))

            if args.remove_mismatch_frames:
                for n in range(num_rows):
                    if should_remove_frame(pasted[n, :].getA()[0], dim):
                        out_mat[n, :] = np.zeros([1, dim])
                        continue
                    for i, (lo, hi) in enumerate(bounds):
                        out_mat[n, :] += pasted[n, lo:hi] * scale_of(i)
            else:
                # Just interpolate the targets
                for i, (lo, hi) in enumerate(bounds):
                    out_mat += pasted[:, lo:hi] * scale_of(i)

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            merged_count += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=merged_count))
    if merged_count == 0:
        raise RuntimeError
def load_rttm(file):
    """Read an RTTM file into a dict keyed by utterance id.

    For each line, field 1 is split on ``_`` to get the session id (first
    piece) and the reference (last piece); fields 3 and 4 are the start
    time and duration in seconds; field 7 is the speaker.  The key is
    ``<spkr>-<start*100 as 6 digits>-<end*100 as 6 digits>`` and the value
    is the tuple ``(end_time_str, start_time_str, "NA", spkr, reference,
    "NA", sessionid)``, where the time strings are ``H:MM:SS.ss``-style
    renderings built from ``str(timedelta(...))``.
    """

    def clock_string(seconds):
        # Render seconds via timedelta, re-formatting the seconds field
        # with two decimals.
        pieces = str(timedelta(seconds=seconds)).split(':')
        return ':'.join([str(pieces[0]), str(pieces[1]),
                         "{0:.2f}".format(float(pieces[2]))])

    recoid_dict = {}
    with common_lib.smart_open(file, 'r') as f:
        for line in f:
            fields = line.strip().split()
            id_pieces = fields[1].split('_')
            sessionid, reference = id_pieces[0], id_pieces[-1]

            begin = float(fields[3])
            begin_str = clock_string(begin)
            finish = begin + float(fields[4])
            finish_str = clock_string(finish)

            spkr = fields[7]
            utt = "{0}-{1:06d}-{2:06d}".format(spkr, int(begin * 100),
                                               int(finish * 100))
            recoid_dict[utt] = (finish_str, begin_str, "NA", spkr,
                                reference, "NA", sessionid)
    return recoid_dict
def main():
    """Parse the command line and write the graph to ``args.output_graph``.

    The output is opened with ``common_lib.smart_open`` and filled by
    ``print_states``.  Exceptions propagate to the caller unchanged; the
    former ``try/except Exception: raise`` wrapper did nothing but re-raise
    and has been dropped.
    """
    args = get_args()
    with common_lib.smart_open(args.output_graph, 'w') as f:
        print_states(args, f)
def main():
    """Entry point: obtain the arguments and emit the graph.

    Opens ``args.output_graph`` for writing through
    ``common_lib.smart_open`` and hands the handle to ``print_states``.
    Any exception is left to propagate as-is (the previous handler only
    re-raised it, so removing it does not change behaviour).
    """
    args = get_args()
    with common_lib.smart_open(args.output_graph, 'w') as f:
        print_states(args, f)
def main():
    """Write an RTTM file from a segments file and an utt2spk map.

    Each segments line ``<utt> <reco> <start> <end> ...`` becomes one
    ``SPEAKER`` record whose speaker comes from ``args.utt2spk``.  The file
    id and channel default to the recording id and ``1``; when
    ``args.reco2file_and_channel`` is given they are looked up there.

    Raises:
        RuntimeError: if a recording is missing from the
            reco2file_and_channel map.
    """
    args = get_args()

    use_reco_map = args.reco2file_and_channel is not None
    if use_reco_map:
        reco2file_and_channel = {}
        with common_lib.smart_open(args.reco2file_and_channel) as fh:
            for line in fh:
                fields = line.strip().split()
                reco2file_and_channel[fields[0]] = (fields[1], fields[2])

    utt2spk = {}
    with common_lib.smart_open(args.utt2spk) as fh:
        for line in fh:
            fields = line.strip().split()
            utt2spk[fields[0]] = fields[1]

    record_format = ("SPEAKER {0} {1} {2:7.2f} {3:7.2f} "
                     "<NA> <NA> {4} <NA>")
    with common_lib.smart_open(args.segments) as segments_reader, \
            common_lib.smart_open(args.rttm_file, "w") as rttm_writer:
        for line in segments_reader:
            fields = line.strip().split()
            spkr = utt2spk[fields[0]]
            reco = fields[1]

            file_id, channel = reco, 1
            if use_reco_map:
                if reco not in reco2file_and_channel:
                    raise RuntimeError(
                        "Could not find recording {0} in {1}".format(
                            reco, args.reco2file_and_channel))
                file_id, channel = reco2file_and_channel[reco]

            start_time = float(fields[2])
            duration = float(fields[3]) - start_time

            record = record_format.format(file_id, channel, start_time,
                                          duration, spkr)
            print(record, file=rttm_writer)
def run(args):
    """Sum the per-source target blocks pasted side by side in each matrix.

    Matrices come from ``args.pasted_targets``; each must have a column
    count that is a multiple of ``args.dim``.  Block ``i`` (columns
    ``i*dim`` to ``(i+1)*dim``) is scaled by ``args.weights[i]`` if weights
    were supplied, otherwise by 1.0, and the blocks are added together.
    If ``args.remove_mismatch_frames`` is set, the sum is done row by row
    and rows flagged by ``should_remove_frame`` stay all-zero.  Results are
    written to ``args.out_targets``.

    Raises:
        RuntimeError: on a column count that is not a multiple of
            ``args.dim`` or when nothing was merged.
    """
    num_done = 0
    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, 'w') as targets_writer:
        for key, raw in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(raw)
            total_cols = mat.shape[1]
            if total_cols % args.dim:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=args.pasted_targets.name,
                              nc=total_cols, dim=args.dim))

            num_sources = total_cols // args.dim
            out_mat = np.matrix(np.zeros([mat.shape[0], args.dim]))

            if not args.remove_mismatch_frames:
                # Just interpolate the targets
                for i in range(num_sources):
                    cols = slice(i * args.dim, (i + 1) * args.dim)
                    factor = (1.0 if args.weights is None
                              else args.weights[i])
                    out_mat += mat[:, cols] * factor
            else:
                for n in range(mat.shape[0]):
                    row = mat[n, :]
                    if should_remove_frame(row.getA()[0], args.dim):
                        out_mat[n, :] = np.zeros([1, args.dim])
                    else:
                        for i in range(num_sources):
                            cols = slice(i * args.dim, (i + 1) * args.dim)
                            factor = (1.0 if args.weights is None
                                      else args.weights[i])
                            out_mat[n, :] += mat[n, cols] * factor

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            num_done += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=num_done))

    if not num_done:
        raise RuntimeError
def main():
    """Convert a segments file plus utt2spk into RTTM ``SPEAKER`` records.

    For every line ``<utt> <reco> <start> <end> ...`` of ``args.segments``
    one record is written to ``args.rttm_file`` with the speaker taken from
    ``args.utt2spk``.  File id and channel default to the recording id and
    ``1`` and are overridden from ``args.reco2file_and_channel`` when that
    option is given.

    Raises:
        RuntimeError: if a recording is missing from the
            reco2file_and_channel map.
    """
    args = get_args()

    if args.reco2file_and_channel is not None:
        reco2file_and_channel = {}
        with common_lib.smart_open(args.reco2file_and_channel) as fh:
            for line in fh:
                parts = line.strip().split()
                reco2file_and_channel[parts[0]] = (parts[1], parts[2])

    utt2spk = {}
    with common_lib.smart_open(args.utt2spk) as fh:
        for line in fh:
            parts = line.strip().split()
            utt2spk[parts[0]] = parts[1]

    with common_lib.smart_open(args.segments) as segments_reader, \
            common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
        for line in segments_reader:
            parts = line.strip().split()

            utt = parts[0]
            spkr = utt2spk[utt]

            reco = parts[1]
            file_id = reco
            channel = 1

            if args.reco2file_and_channel is not None:
                try:
                    file_id, channel = reco2file_and_channel[reco]
                except KeyError:
                    raise RuntimeError(
                        "Could not find recording {0} in {1}".format(
                            reco, args.reco2file_and_channel))

            start_time = float(parts[2])
            duration = float(parts[3]) - start_time

            # print() already terminates the record with a newline; the
            # format string must not add a second one, otherwise every
            # record is followed by an empty line.
            print("SPEAKER {0} {1} {2:7.2f} {3:7.2f} "
                  "<NA> <NA> {4} <NA>".format(
                      file_id, channel, start_time, duration, spkr),
                  file=rttm_writer)
def read_reco2utt_file(reco2utt_file):
    """Load a reco2utt table into a dict.

    Each line is ``<reco> <utt1> [<utt2> ...]``; the result maps the
    recording id to the list of utterance ids that follow it.

    Raises:
        ValueError: if a line has fewer than two whitespace-separated
            fields.
    """
    mapping = {}
    with common_lib.smart_open(reco2utt_file) as fh:
        for line in fh:
            fields = line.strip().split()
            if len(fields) < 2:
                message = ("Could not parse line {0} in reco2utt "
                           "file {1}".format(line, reco2utt_file))
                raise ValueError(message)
            reco, utts = fields[0], fields[1:]
            mapping[reco] = utts
    return mapping
def read_reco2utt_file(reco2utt_file):
    """Return ``{recording-id: [utterance-id, ...]}`` read from a file.

    The first field of every line is the recording id and the remaining
    fields are its utterances.

    Raises:
        ValueError: for a line with no utterance after the recording id
            (including an empty line).
    """
    reco2utt = {}
    with common_lib.smart_open(reco2utt_file) as fh:
        for line in fh:
            tokens = line.strip().split()
            if len(tokens) >= 2:
                reco2utt[tokens[0]] = tokens[1:]
                continue
            raise ValueError("Could not parse line {0} in reco2utt "
                             "file {1}".format(line, reco2utt_file))
    return reco2utt
def read_reco2num_frames_file(reco2num_frames_file):
    """Load a reco2num_frames table into a dict.

    Each line must be ``<reco> <num-frames>``; the frame count is stored
    as an ``int``.

    Raises:
        ValueError: if a line does not have exactly two fields, or the
            second field is not an integer.
    """
    frame_counts = {}
    with common_lib.smart_open(reco2num_frames_file) as fh:
        for line in fh:
            fields = line.strip().split()
            if len(fields) != 2:
                message = ("Could not parse line {0} in "
                           "reco2num-frames file {1}".format(
                               line, reco2num_frames_file))
                raise ValueError(message)
            reco, count = fields
            frame_counts[reco] = int(count)
    return frame_counts
def read_reco2num_frames_file(reco2num_frames_file):
    """Return ``{recording-id: number-of-frames}`` read from a file.

    Every line carries exactly two whitespace-separated fields: the
    recording id and an integer frame count.

    Raises:
        ValueError: on a line with a different number of fields or a
            non-integer count.
    """
    reco2num_frames = {}
    with common_lib.smart_open(reco2num_frames_file) as fh:
        for line in fh:
            tokens = line.strip().split()
            if len(tokens) == 2:
                reco2num_frames[tokens[0]] = int(tokens[1])
                continue
            raise ValueError("Could not parse line {0} in "
                             "reco2num-frames file {1}".format(
                                 line, reco2num_frames_file))
    return reco2num_frames
def load_rttm(rttmf):
    """Parse an RTTM file into turns.

    Lines starting with ``SPKR-INFO`` are skipped; every other line is
    handed to ``_parse_rttm_line``.

    Returns:
        A tuple ``(turns, speaker_ids, file_ids)``: the list of parsed
        turns in file order, and the sets of ``speaker_id`` and ``file_id``
        values seen on them.
    """
    with common_lib.smart_open(rttmf, 'r') as f:
        turns = [_parse_rttm_line(line)
                 for line in f
                 if not line.startswith('SPKR-INFO')]
    speaker_ids = set()
    file_ids = set()
    for turn in turns:
        speaker_ids.add(turn.speaker_id)
        file_ids.add(turn.file_id)
    return turns, speaker_ids, file_ids
def read_targets_scp(targets_scp, segments):
    """Read a targets scp file, keeping only utterances in ``segments``.

    Args:
        targets_scp: path of the scp file; each line is ``<utt> <value>``.
        segments: container of utterance ids (only membership is tested).

    Returns:
        Dict mapping each kept utterance id to the second field of its
        line.

    Raises:
        ValueError: if a line does not have exactly two fields.
    """
    targets = {}
    with common_lib.smart_open(targets_scp) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) != 2:
                # The file name was passed to format() but had no
                # placeholder, so it never appeared in the message.
                raise ValueError("Could not parse line {0} in "
                                 "targets scp file {1}".format(
                                     line, targets_scp))
            utt = parts[0]
            if utt not in segments:
                continue
            targets[utt] = parts[1]
    return targets
def read_targets_scp(targets_scp, segments):
    """Return ``{utt: scp-entry}`` for utterances present in ``segments``.

    Args:
        targets_scp: path of the scp file with ``<utt> <value>`` lines.
        segments: container used only for ``utt in segments`` tests;
            utterances not in it are dropped.

    Raises:
        ValueError: if a line does not consist of exactly two fields.
    """
    targets = {}
    with common_lib.smart_open(targets_scp) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) != 2:
                # Include the file name: it was supplied as a format
                # argument but the template lacked the {1} placeholder.
                raise ValueError("Could not parse line {0} in "
                                 "targets scp file {1}".format(
                                     line, targets_scp))
            utt, entry = parts
            if utt in segments:
                targets[utt] = entry
    return targets
def write_rttm(rttmf, turns, n_digits=3):
    """Write turns to an RTTM file, ordered by onset.

    Args:
        rttmf: output path, opened through ``common_lib.smart_open``.
        turns: iterable of objects with ``onset``, ``dur``, ``file_id`` and
            ``speaker_id`` attributes.
        n_digits: passed to ``format_float`` for the onset and duration
            fields.
    """
    with common_lib.smart_open(rttmf, 'w') as f:
        ordered = list(turns)
        ordered.sort(key=lambda t: t.onset)
        for turn in ordered:
            record = ' '.join([
                'SPEAKER',
                turn.file_id,
                '1',
                format_float(turn.onset, n_digits),
                format_float(turn.dur, n_digits),
                '<NA>',
                '<NA>',
                turn.speaker_id,
                '<NA>',
                '<NA>',
            ])
            f.write(record + '\n')
def read_segments_file(segments_file, reco2utt):
    """Read a segments file, keeping recordings present in ``reco2utt``.

    Args:
        segments_file: path whose lines are
            ``<utt> <reco> <start> <end> [<extra>]``.
        reco2utt: container of recording ids (only membership is tested).

    Returns:
        Dict mapping utterance id to ``[reco, start_time, end_time]`` with
        the times as floats.

    Raises:
        ValueError: if a line does not have 4 or 5 fields.
    """
    segments = {}
    with common_lib.smart_open(segments_file) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                # Report the file name; the message used to format the
                # partially built ``segments`` dict instead.
                raise ValueError("Could not parse line {0} in "
                                 "segments file {1}".format(
                                     line, segments_file))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]
    return segments
def read_segments_file(segments_file, reco2utt):
    """Return ``{utt: [reco, start, end]}`` for recordings in ``reco2utt``.

    Args:
        segments_file: path of a file with lines
            ``<utt> <reco> <start> <end> [<extra>]``.
        reco2utt: container used only for ``reco in reco2utt`` tests;
            segments of other recordings are skipped.

    Raises:
        ValueError: if a line has a field count other than 4 or 5.
    """
    segments = {}
    with common_lib.smart_open(segments_file) as fh:
        for line in fh:
            parts = line.strip().split()
            if len(parts) not in (4, 5):
                # Name the offending file in the message (previously the
                # in-progress result dict was formatted here by mistake).
                raise ValueError("Could not parse line {0} in "
                                 "segments file {1}".format(
                                     line, segments_file))
            utt, reco = parts[0], parts[1]
            if reco not in reco2utt:
                continue
            segments[utt] = [reco, float(parts[2]), float(parts[3])]
    return segments
for line in f: <<<<<<< HEAD parts = line.strip().split() if len(parts) != 2: raise ValueError("Could not parse line {0}".format(line)) reco2num_frames[parts[0]] = int(parts[1]) ======= fields = line.strip().split() if len(fields) != 2: raise ValueError("Could not parse line {0}".format(line)) reco2num_frames[fields[0]] = int(fields[1]) >>>>>>> upstream/master # We read all segments and store as a list of objects segments = [] with common_lib.smart_open(args.rttm) as f: for line in f.readlines(): <<<<<<< HEAD fields = line.strip().split() segments.append(Segment(fields[1], float(fields[3]), dur=float(fields[4]), label=fields[7])) # We group the segment list into a dictionary indexed by reco_id reco2segs = defaultdict(list, {reco_id : list(g) for reco_id, g in groupby(segments, lambda x: x.reco_id)}) # Now, for each reco, create a matrix of shape num_frames x 3 and fill in using # the segments information for that reco reco2targets = {} for reco_id in reco2num_frames: segs = sorted(reco2segs[reco_id], key=lambda x: x.start_time) =======
def run(args):
    """Write per-recording default targets for out-of-segment regions.

    For every recording in ``args.reco2utt`` a matrix with
    ``reco2num_frames[reco]`` rows is created, each row equal to the
    default target vector (``args.default_targets`` if given, else
    ``[1, 0, 0]``).  The rows covered by the recording's segments (times
    divided by ``args.frame_shift``, truncated, clipped to the recording
    length) are zeroed, and the matrix is written to
    ``args.out_targets_ark``.

    Raises:
        ValueError: if a line of an input file cannot be parsed.
        RuntimeError: if no utterance or recording was processed, or more
            than ``num_utt // 2`` utterances failed.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # dict.items() works on Python 2 and 3; iteritems() is Python 2
        # only.
        for reco, utts in reco2utt.items():
            num_reco_frames = reco2num_frames[reco]
            reco_mat = np.repeat(default_targets, num_reco_frames, axis=0)

            # Utterances without a segments entry are counted as errors
            # up front: sorting on segments[x] would otherwise raise
            # KeyError before they could be skipped.
            known_utts = [utt for utt in utts if utt in segments]
            num_utt_err += len(utts) - len(known_utts)
            known_utts.sort(key=lambda x: segments[x][1])  # on start time

            for utt in known_utts:
                segment = segments[utt]
                start_frame = int(segment[1] / args.frame_shift)
                end_frame = min(int(segment[2] / args.frame_shift),
                                num_reco_frames)
                num_frames = end_frame - start_frame

                if num_frames < 0:
                    # The segment starts beyond the end of the recording;
                    # np.zeros would reject the negative dimension.
                    logger.warning("For utterance {utt}, start-frame "
                                   "{start} is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = np.zeros([num_frames, 3])
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} "
                "recordings containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {0} utterances done, {1} failed, "
            "{2} recordings written".format(num_utt, num_utt_err, num_reco))
def run(args):
    """Convert lattice arc info into per-frame 3-column target matrices.

    ``args.silence_phones`` and ``args.garbage_phones`` list one phone per
    line (first field).  Each line of ``args.arc_info`` is
    ``<utt> <start-frame> <num-frames> <posterior> <phone>``; for every
    frame of the arc the posterior is added to column 0 if the phone is a
    silence phone, to column 2 if the arc is longer than
    ``args.max_phone_length`` or the phone is a garbage phone, and to
    column 1 otherwise.  A matrix is written to ``args.targets_file``
    whenever the utterance id changes, and once more for the last
    utterance.

    Raises:
        RuntimeError: if either phone list is empty, a phone is in both
            lists, no targets were written, or ``num_err`` reaches
            ``num_utts / 2``.
    """
    silence_phones = {}
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones[line.strip().split()[0]] = 1

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = {}
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones[word] = 1

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]
                if utt != prev_utt:
                    # A new utterance starts: flush the finished one.
                    if prev_utt != "":
                        if len(targets) > 0:
                            num_utts += 1
                            common_lib.write_matrix_ascii(
                                targets_writer, targets, key=prev_utt)
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                # Grow the matrix so that it covers the arc's last frame.
                if start_frame + num_frames > len(targets):
                    for t in range(len(targets), start_frame + num_frames):
                        targets.append([0, 0, 0])
                    assert start_frame + num_frames == len(targets)

                for t in range(start_frame, start_frame + num_frames):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # Flush the last utterance through the same open writer as all the
        # others.  The path args.targets_file used to be passed here
        # instead, unlike every other write in this function.
        # NOTE(review): if write_matrix_ascii opens a path it is given,
        # the old call would have clobbered the output - confirm.
        if prev_utt != "":
            if len(targets) > 0:
                num_utts += 1
                common_lib.write_matrix_ascii(targets_writer, targets,
                                              key=prev_utt)
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    if num_utts == 0 or num_err >= num_utts / 2:
        raise RuntimeError(
            "Too many failures: wrote {0} targets, failed with {1}"
            "".format(num_utts, num_err))
def run(args):
    """Build per-recording silence/speech target matrices from an RTTM.

    For every recording in ``args.reco2num_frames`` a
    ``num_frames x 2`` matrix is created whose rows default to the silence
    vector ``[1 - ls, ls / 2]`` (``ls = args.label_smoothing``).  Frames
    covered by a segment of ``args.rttm`` (fields 3 and 4 are start and
    duration in seconds; converted to frames with ``args.frame_shift`` and
    clipped to the recording length) are set to the speech vector
    ``[ls / 2, 1 - ls]``.  Matrices are written to
    ``args.out_targets_ark`` in sorted recording-id order.

    Raises:
        ValueError: if a reco2num_frames line does not have two fields.
    """
    # Get all reco to num_frames, which will be used to decide the number
    # of rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[fields[0]] = int(fields[1])

    # Read all segments and group them by recording, preserving file order
    # within each recording.
    reco2segs = defaultdict(list)
    with common_lib.smart_open(args.rttm) as f:
        for line in f:
            segment_fields = line.strip().split()
            start = float(segment_fields[3])
            duration = float(segment_fields[4])
            segment = Segment(reco=segment_fields[1],
                              spk=segment_fields[7],
                              start=start, dur=duration,
                              end=start + duration)
            reco2segs[segment.reco].append(segment)

    # The label vectors do not depend on the recording.  The builtin float
    # is used as dtype: np.float was only an alias for it and has been
    # removed from NumPy.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val], dtype=float)
    speech_vec = np.array([other_val, target_val], dtype=float)

    # Now, for each reco, create a matrix of shape num_frames x 2 and fill
    # in using the segments information for that reco
    reco2targets = {}
    for reco_id in reco2num_frames:
        total_frames = reco2num_frames[reco_id]
        segs = sorted(reco2segs[reco_id], key=lambda x: x.start)

        # The default target (if not speech) is silence
        targets_mat = np.tile(silence_vec, (total_frames, 1))

        # Now iterate over all segments of the recording and assign targets
        for seg in segs:
            start_frame = int(seg.start / args.frame_shift)
            end_frame = min(int(seg.end / args.frame_shift), total_frames)
            num_frames = end_frame - start_frame
            if num_frames <= 0:
                continue
            targets_mat[start_frame:end_frame] = np.tile(
                speech_vec, (num_frames, 1))

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f, reco2targets[reco_id].tolist(),
                                          key=reco_id)
def run(args):
    """Merge per-utterance target matrices into per-recording matrices.

    For each recording a matrix of ``reco2num_frames[reco]`` rows is
    initialised with the default targets (``args.default_targets`` or
    zeros).  Each utterance's targets are read with ``copy-feats`` from
    the entry in ``args.targets_scp`` and copied into the rows given by
    its segment times (``time / args.frame_shift + 0.5``, truncated).
    Where an utterance starts before the previous utterance's end, the
    overlapping rows are linearly interpolated between the existing and
    the new targets.  Results are written to ``args.out_targets_ark``.

    Raises:
        ValueError: if a segment has a non-positive length in frames.
        RuntimeError: if ``copy-feats`` fails, if nothing was merged, or
            if more than ``num_utt / 2`` utterances failed.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, 'w') as fh:
        # dict.items() works on Python 2 and 3; iteritems() is Python 2
        # only.
        for reco, all_utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)

            # Utterances with no segments entry are counted as failures
            # here; both the sort key and the look-up of the previous
            # utterance's end index into ``segments`` and would otherwise
            # raise KeyError.
            utts = [utt for utt in all_utts if utt in segments]
            num_utt_err += len(all_utts) - len(utts)
            utts.sort(key=lambda x: segments[x][1])  # sort on start time

            for i, utt in enumerate(utts):
                if utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments.
                # NOTE(review): the scp entry is interpolated into a shell
                # command (shell=True); it must come from a trusted file.
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype='float32')
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            'stderr = {stderr}'.format(
                                cmd=cmd, status=-p.returncode,
                                stderr=stderr))

                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Clip the segment to the length of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                prev_utt_end_frame = (
                    int(segments[utts[i - 1]][2] / args.frame_shift + 0.5)
                    if i > 0 else 0)
                if start_frame < prev_utt_end_frame:
                    # Segment overlaps with the previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end
                    # of overlap and 0 at the end/start of the segment
                    overlap = prev_utt_end_frame - start_frame
                    for n in range(0, overlap):
                        w = float(n) / float(overlap)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    num_frames = min(num_frames, mat.shape[0])
                    end_frame = start_frame + num_frames

                    reco_mat[prev_utt_end_frame:end_frame, :] = (
                        mat[(prev_utt_end_frame - start_frame):
                            (end_frame - start_frame), :])
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    num_frames = min(num_frames, mat.shape[0])
                    reco_mat[start_frame:(start_frame + num_frames), :] = (
                        mat[0:num_frames, :])
                logger.debug("reco_mat shape = %s, mat shape = %s, "
                             "start_frame = %d, end_frame = %d",
                             reco_mat.shape, mat.shape,
                             start_frame, end_frame)
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat, key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {0} utterances merged, {1} failed, "
            "{2} recordings written".format(num_utt, num_utt_err, num_reco))
def run(args):
    """Merge per-utterance target matrices into per-recording matrices.

    For each recording a matrix of ``reco2num_frames[reco]`` rows is
    initialised with the default targets (``args.default_targets`` or
    zeros).  Utterances are visited in order of segment start time; each
    one's targets are read with ``copy-feats`` from its
    ``args.targets_scp`` entry and copied into the rows given by its
    segment times (``time / args.frame_shift + 0.5``, truncated), clipped
    to the recording length and to the size of the targets matrix.
    ``end_frame_accounted`` tracks the end of the last merged segment:
    a segment ending before it is ignored, and rows of a segment that
    start before it are linearly interpolated with the existing targets.
    Results are written to ``args.out_targets_ark``.

    Raises:
        ValueError: if a segment has a non-positive length in frames.
        RuntimeError: if ``copy-feats`` fails, if nothing was merged, or
            if more than ``num_utt // 2`` utterances failed.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, "w") as fh:
        for reco, all_utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)

            # Utterances with no segments entry are counted as failures
            # before sorting; the sort key indexes ``segments`` and would
            # otherwise raise KeyError, so they could never be skipped.
            utts = [utt for utt in all_utts if utt in segments]
            num_utt_err += len(all_utts) - len(utts)
            utts.sort(key=lambda x: segments[x][1])  # sort on start time

            end_frame_accounted = 0

            for utt in utts:
                if utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments.
                # NOTE(review): the scp entry is interpolated into a shell
                # command (shell=True); it must come from a trusted file.
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype="float32")
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            "stderr = {stderr}".format(
                                cmd=cmd, status=-p.returncode,
                                stderr=stderr))

                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Fix end_frame and num_frames if the segment goes beyond
                # the length of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                # Fix "num_frames" and "end_frame" if "num_frames" is lower
                # than the size of the targets matrix "mat"
                num_frames = min(num_frames, mat.shape[0])
                end_frame = start_frame + num_frames

                if num_frames <= 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                if end_frame < end_frame_accounted:
                    logger.warning("For utterance {utt}, end-frame {end} "
                                   "is before the end of a previous segment. "
                                   "i.e. this segment is completely within "
                                   "another segment. Ignoring this segment."
                                   "".format(utt=utt, end=end_frame))
                    num_utt_err += 1
                    continue

                if start_frame < end_frame_accounted:
                    # Segment overlaps with a previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end
                    # of overlap and 0 at the end/start of the segment
                    overlap = end_frame_accounted - start_frame
                    for n in range(0, overlap):
                        w = float(n) / float(overlap)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    if end_frame > end_frame_accounted:
                        reco_mat[end_frame_accounted:end_frame, :] = (
                            mat[(end_frame_accounted - start_frame):
                                (end_frame - start_frame), :])
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    reco_mat[start_frame:end_frame, :] = mat[0:num_frames, :]

                logger.debug("reco_mat shape = %s, mat shape = %s, "
                             "start_frame = %d, end_frame = %d",
                             reco_mat.shape, mat.shape,
                             start_frame, end_frame)

                end_frame_accounted = end_frame
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat, key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {0} utterances merged, {1} failed, "
            "{2} recordings written".format(num_utt, num_utt_err, num_reco))
def run(args):
    """Write per-recording default targets for out-of-segment regions.

    Every recording listed in ``args.reco2utt`` gets a matrix of
    ``reco2num_frames[reco]`` rows, each equal to the default target
    vector (``args.default_targets`` if given, otherwise ``[1, 0, 0]``).
    Rows covered by one of the recording's segments (segment times divided
    by ``args.frame_shift``, truncated, clipped to the recording length)
    are set to zero.  The matrices go to ``args.out_targets_ark``.

    Raises:
        ValueError: if a line of an input file cannot be parsed.
        RuntimeError: if no utterance or recording was processed, or more
            than ``num_utt / 2`` utterances failed.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            segments[utt] = [reco, float(parts[2]), float(parts[3])]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # items() instead of the Python-2-only iteritems().
        for reco, utts in reco2utt.items():
            frames_in_reco = reco2num_frames[reco]
            reco_mat = np.repeat(default_targets, frames_in_reco, axis=0)

            # Drop (and count as errors) utterances that have no segment
            # before sorting, since the sort key indexes ``segments`` and
            # would raise KeyError for them.
            with_segment = [utt for utt in utts if utt in segments]
            num_utt_err += len(utts) - len(with_segment)
            with_segment.sort(key=lambda x: segments[x][1])  # start time

            for utt in with_segment:
                segment = segments[utt]
                start_frame = int(segment[1] / args.frame_shift)
                end_frame = int(segment[2] / args.frame_shift)
                if end_frame > frames_in_reco:
                    end_frame = frames_in_reco
                num_frames = end_frame - start_frame

                if num_frames < 0:
                    # Segment starts past the end of the recording; a
                    # negative row count would make np.zeros fail.
                    logger.warning("For utterance {utt}, start-frame "
                                   "{start} is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = np.zeros([num_frames, 3])
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} "
                "recordings containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Too many failures: {0} utterances done, {1} failed, "
            "{2} recordings written".format(num_utt, num_utt_err, num_reco))
def run(args):
    """Convert lattice arc info into per-frame 3-column target matrices.

    The first field of each line of ``args.silence_phones`` and
    ``args.garbage_phones`` names a phone.  Each ``args.arc_info`` line is
    ``<utt> <start-frame> <num-frames> <posterior> <phone>``.  For each
    frame of an arc, the posterior is accumulated in column 0 for silence
    phones, in column 2 for arcs longer than ``args.max_phone_length`` or
    garbage phones, and in column 1 otherwise.  One matrix per utterance
    is written to ``args.targets_file``.

    Raises:
        RuntimeError: if either phone list is empty, a phone is in both
            lists, no targets were written, or ``num_err`` reaches
            ``num_utts // 2``.
    """
    silence_phones = {}
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones[line.strip().split()[0]] = 1

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = {}
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones[word] = 1

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]
                if utt != prev_utt:
                    # Utterance boundary: write out what was accumulated.
                    if prev_utt != "":
                        if len(targets) > 0:
                            num_utts += 1
                            common_lib.write_matrix_ascii(
                                targets_writer, targets, key=prev_utt)
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                # Extend the matrix up to the last frame of this arc.
                if start_frame + num_frames > len(targets):
                    for t in range(len(targets), start_frame + num_frames):
                        targets.append([0, 0, 0])
                    assert start_frame + num_frames == len(targets)

                for t in range(start_frame, start_frame + num_frames):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # The final utterance has no following boundary, so flush it here,
        # through the writer used for all other utterances (the output
        # path was previously passed instead of the open handle).
        # NOTE(review): if write_matrix_ascii re-opens a path argument for
        # writing, the old call would have replaced the whole file -
        # confirm against common_lib.
        if prev_utt != "":
            if len(targets) > 0:
                num_utts += 1
                common_lib.write_matrix_ascii(targets_writer, targets,
                                              key=prev_utt)
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    if num_utts == 0 or num_err >= num_utts // 2:
        raise RuntimeError(
            "Too many failures: wrote {0} targets, failed with {1}"
            "".format(num_utts, num_err))
def run(args):
    """Build silence/single/overlap target matrices from an overlap RTTM.

    For every recording in ``args.reco2num_frames`` a ``num_frames x 3``
    matrix is created whose rows default to the silence vector
    ``[1 - ls, ls / 2, ls / 2]`` (``ls = args.label_smoothing``).  Frames
    covered by a segment of ``args.overlap_rttm`` are set to the overlap
    vector (target in column 2) when the segment label is ``"overlap"``
    and to the single-speaker vector (target in column 1) otherwise.
    Segments are applied in order of start time, converted to frames with
    ``args.frame_shift`` and clipped to the recording length.  Matrices
    are written to ``args.out_targets_ark`` in sorted recording-id order.

    Raises:
        ValueError: if a reco2num_frames line does not have two fields.
    """
    # Get all reco to num_frames, which will be used to decide the number
    # of rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[parts[0]] = int(parts[1])

    # Read all segments straight into a dictionary indexed by reco_id.
    # Appending per key keeps every segment even when the RTTM is not
    # sorted by recording; itertools.groupby only merges *consecutive*
    # items, so building a dict from it on unsorted input kept only the
    # last run of each recording.
    reco2segs = defaultdict(list)
    with common_lib.smart_open(args.overlap_rttm) as f:
        for line in f:
            parts = line.strip().split()
            segment = Segment(parts[1], float(parts[3]),
                              dur=float(parts[4]), label=parts[7])
            reco2segs[segment.reco_id].append(segment)

    # The label vectors are the same for every recording.  The builtin
    # float replaces np.float, which was an alias for it and has been
    # removed from NumPy.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val, other_val], dtype=float)
    single_vec = np.array([other_val, target_val, other_val], dtype=float)
    overlap_vec = np.array([other_val, other_val, target_val], dtype=float)

    # Now, for each reco, create a matrix of shape num_frames x 3 and fill
    # in using the segments information for that reco
    reco2targets = {}
    for reco_id in reco2num_frames:
        total_frames = reco2num_frames[reco_id]
        segs = sorted(reco2segs[reco_id], key=lambda x: x.start_time)

        # The default target (if not single or overlap) is silence
        targets_mat = np.tile(silence_vec, (total_frames, 1))

        # Now iterate over all segments of the recording and assign targets
        for seg in segs:
            start_frame = int(seg.start_time / args.frame_shift)
            end_frame = min(int(seg.end_time / args.frame_shift),
                            total_frames)
            num_frames = end_frame - start_frame
            if num_frames <= 0:
                continue
            label_vec = overlap_vec if seg.label == "overlap" else single_vec
            targets_mat[start_frame:end_frame] = np.tile(
                label_vec, (num_frames, 1))

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f, reco2targets[reco_id].tolist(),
                                          key=reco_id)
def WriteDistMatrices(D, wark):
    """Write the matrices in ``D`` to ``wark``, in sorted key order.

    Args:
        D: mapping from key to an object with a ``tolist()`` method.
        wark: output path, opened through ``common_lib.smart_open``.
    """
    with common_lib.smart_open(wark, 'w') as f:
        ordered_keys = list(D.keys())
        ordered_keys.sort()
        for key in ordered_keys:
            rows = D[key].tolist()
            common_lib.write_matrix_ascii(f, rows, key=key)
def run(args): # Get all reco to num_frames, which will be used to decide the number of # rows of matrix reco2num_frames = {} with common_lib.smart_open(args.reco2num_frames) as f: for line in f: