def read_rttm(cls, filename, normalize_cluster=False, encoding="utf8"):
    """
    Read a RTTM file.

    Every non-comment line must hold exactly nine whitespace-separated
    fields. Only lines whose first field is ``SPEAKER`` produce a segment;
    other record types are parsed and ignored. The 4th field (start) and
    5th field (length) are multiplied by 100 and rounded to the nearest
    integer before being stored.

    Parsing is best effort: when a line cannot be parsed the exception type
    and the last line read are logged, parsing stops, and the segments
    collected so far are returned.

    :param filename: str input filename
    :param normalize_cluster: normalize the cluster by removing upper case
        and accents
    :param encoding: text encoding used to open the file
    :return: a diarization object
    """
    diarization = Diar()
    if not diarization._attributes.exist('gender'):
        diarization.add_attribut(new_attribut='gender', default='U')

    # Pre-bound so the error handler can always log it, even when the very
    # first read fails (e.g. on a decoding error).
    line = ''
    # The context manager closes the file on every exit path.
    with open(filename, 'r', encoding=encoding) as fic:
        try:
            for line in fic:
                line = re.sub(r'\s+', ' ', line).strip()
                # Skip blank lines and comments.
                if not line or line.startswith(('#', ';;')):
                    continue
                # split line into fields
                (spk, show, _tmp0, start_str, length,
                 _tmp1, _tmp2, cluster, _tmp3) = line.split()
                if spk != "SPEAKER":
                    continue
                start = int(round(float(start_str) * 100, 0))
                stop = start + int(round(float(length) * 100, 0))
                if normalize_cluster:
                    cluster = str2str_normalize(cluster)
                diarization.append(show=show, cluster=cluster,
                                   start=start, stop=stop)
        except Exception as e:
            # Deliberate best effort: report and keep what was read so far.
            logging.error(type(e))
            logging.error(line)
    return diarization
def read_mdtm(cls, filename, normalize_cluster=False, encoding="utf8"):
    """
    Read a MDTM file.

    Every non-comment line must hold exactly eight whitespace-separated
    fields: show, an ignored field, start, length, two ignored fields,
    gender and cluster. Start and length are multiplied by 100 and rounded
    to the nearest integer before being stored.

    Unlike the other readers, parsing errors are not caught: a malformed
    line raises (typically ``ValueError``) to the caller. The file is
    closed in every case.

    :param filename: the str input filename
    :param normalize_cluster: normalize the cluster by removing upper case
        and accents
    :param encoding: text encoding used to open the file
    :return: a diarization object
    """
    diarization = Diar()
    if not diarization._attributes.exist('gender'):
        diarization.add_attribut(new_attribut='gender', default='U')

    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r', encoding=encoding) as fic:
        for line in fic:
            line = re.sub(r'\s+', ' ', line.strip())
            logging.debug(line)
            # Skip blank lines and comments.
            if not line or line.startswith(('#', ';;')):
                continue
            # split line into fields
            (show, _tmp, start_str, length,
             _t, _score, gender, cluster) = line.split()
            start = int(round(float(start_str) * 100, 0))
            stop = start + int(round(float(length) * 100, 0))
            if normalize_cluster:
                cluster = str2str_normalize(cluster)
            diarization.append(show=show, cluster=cluster, start=start,
                               stop=stop, gender=gender)
    return diarization
def read_ctm(cls, filename, normalize_cluster=False, encoding="utf8"):
    """
    Read a CTM file.

    Every non-comment line must hold exactly five whitespace-separated
    fields: show, an ignored field, start, length and word. Start and
    length are parsed with ``int`` and stored unchanged; the word is used
    as the cluster name of the segment.

    Parsing is best effort: when a line cannot be parsed the exception type
    is logged, parsing stops, and the segments collected so far are
    returned.

    :param filename: the str input filename
    :param normalize_cluster: normalize the cluster by removing upper case
        and accents
    :param encoding: text encoding used to open the file
    :return: a diarization object
    """
    diarization = Diar()
    # The context manager closes the file on every exit path.
    with open(filename, 'r', encoding=encoding) as fic:
        try:
            for line in fic:
                line = re.sub(r'\s+', ' ', line).strip()
                # Skip blank lines and comments.
                if not line or line.startswith(('#', ';;')):
                    continue
                # split line into fields
                show, _tmp, start, length, word = line.split()
                if normalize_cluster:
                    word = str2str_normalize(word)
                begin = int(start)
                diarization.append(show=show, cluster=word, start=begin,
                                   stop=int(length) + begin)
        except Exception as e:
            # Deliberate best effort: report and keep what was read so far.
            logging.error(type(e))
    return diarization
def read_seg(cls, filename, normalize_cluster=False, encoding="utf8"):
    """
    Read a segmentation (seg) file.

    Every non-comment line must hold exactly eight whitespace-separated
    fields: show, an ignored field, start, length, gender, channel,
    environment and cluster name. Start and length are parsed with ``int``
    and stored unchanged.

    The ``gender``, ``env`` and ``channel`` attributes are registered on
    the diarization (default ``'U'``) when they do not already exist.

    Parsing is best effort: when a line cannot be parsed the exception type
    is logged, parsing stops, and the segments collected so far are
    returned.

    :param filename: the str input filename
    :param normalize_cluster: normalize the cluster speaker by removing
        upper case and accents
    :param encoding: text encoding used to open the file
    :return: a diarization object
    """
    diarization = Diar()
    for attribut in ('gender', 'env', 'channel'):
        if not diarization._attributes.exist(attribut):
            diarization.add_attribut(new_attribut=attribut, default='U')

    # The context manager closes the file on every exit path.
    with open(filename, 'r', encoding=encoding) as fic:
        try:
            for line in fic:
                line = re.sub(r'\s+', ' ', line).strip()
                # Skip blank lines and comments.
                if not line or line.startswith(('#', ';;')):
                    continue
                # split line into fields
                (show, _tmp, start, length,
                 gender, channel, environment, name) = line.split()
                if normalize_cluster:
                    name = str2str_normalize(name)
                begin = int(start)
                diarization.append(show=show, cluster=name, start=begin,
                                   stop=int(length) + begin,
                                   env=environment, channel=channel,
                                   gender=gender)
        except Exception as e:
            # Deliberate best effort: report and keep what was read so far.
            logging.error(type(e))
    return diarization
def read_stm(cls, filename, normalize_cluster=False, encoding="ISO-8859-1"):
    """
    Read a STM file.

    For every non-comment line the whitespace-separated fields are used as
    follows: field 1 is the show, field 3 the cluster (speaker) name,
    fields 4 and 5 the start and stop times, and field 6 a label group
    such as ``<a,b,male>``. Start and stop are multiplied by 100 and
    rounded to the nearest integer. The third label of the group is read
    as the gender: ``female`` is stored as ``'F'``, ``male`` as ``'M'``,
    and any other value leaves the gender attribute at its default.

    Parsing is best effort: when a line cannot be parsed the exception type
    and the last line read are logged, parsing stops, and the segments
    collected so far are returned.

    :param filename: the str input filename
    :param normalize_cluster: normalize the cluster by removing upper case
        and accents
    :param encoding: text encoding used to open the file
    :return: a diarization object
    """
    gender_codes = {"female": "F", "male": "M"}

    diarization = Diar()
    if not diarization._attributes.exist('gender'):
        diarization.add_attribut(new_attribut='gender', default='U')

    # Pre-bound so the error handler can always log it, even when the very
    # first read fails (e.g. on a decoding error).
    line = ''
    # The context manager closes the file on every exit path.
    with open(filename, 'r', encoding=encoding) as fic:
        try:
            for line in fic:
                line = re.sub(r'\s+', ' ', line).strip()
                # Skip blank lines and comments.
                if not line or line.startswith(('#', ';;')):
                    continue
                # split line into fields
                fields = line.split()
                show = fields[0]
                loc = fields[2]
                if normalize_cluster:
                    loc = str2str_normalize(loc)
                # Round instead of truncating: float("0.29") * 100 is
                # 28.999999999999996, which int() alone would turn into 28.
                start = int(round(float(fields[3]) * 100, 0))
                stop = int(round(float(fields[4]) * 100, 0))
                # "<a,b,male>" -> ["a", "b", "male"]
                labels = (fields[5].replace(">", "").replace("<", "")
                          .replace(",", " ").split())
                gender = gender_codes.get(labels[2])
                if gender is None:
                    diarization.append(show=show, cluster=loc,
                                       start=start, stop=stop)
                else:
                    diarization.append(show=show, cluster=loc,
                                       start=start, stop=stop, gender=gender)
        except Exception as e:
            # Deliberate best effort: report and keep what was read so far.
            logging.error(type(e))
            logging.error(line)
    return diarization