Example #1
0
 def read_rttm(cls, filename, normalize_cluster=False, encoding="utf8"):
     """
     Read a RTTM file

     Only records whose first field is ``SPEAKER`` are kept. Their start and
     duration columns (in seconds) are converted to integer hundredths of a
     second before being appended to the diarization.

     Parsing is best-effort: the first malformed line stops the reading, the
     error is logged, and the segments read so far are returned.

     :param filename: str input filename
     :param normalize_cluster: normalize the cluster by removing upper case and accents
     :param encoding: text encoding used to open the file
     :return: a diarization object
     """
     with open(filename, 'r', encoding=encoding) as fic:
         diarization = Diar()
         if not diarization._attributes.exist('gender'):
             diarization.add_attribut(new_attribut='gender', default='U')
         # Pre-bind so the error handler below can run even when reading the
         # very first line fails (e.g. on a decoding error).
         line = None
         try:
             for line in fic:
                 line = re.sub(r'\s+', ' ', line).strip()
                 # Blank lines and comment lines carry no segment.
                 if not line or line.startswith(('#', ';;')):
                     continue
                 # Split line into fields: nine columns are required, any
                 # extra trailing column (some RTTM variants carry a tenth)
                 # is ignored.
                 (spk, show, _tmp0, start_str, length,
                  _tmp1, _tmp2, cluster, _tmp3, *_extra) = line.split()
                 if spk != "SPEAKER":
                     continue
                 start = int(round(float(start_str) * 100, 0))
                 stop = start + int(round(float(length) * 100, 0))
                 if normalize_cluster:
                     cluster = str2str_normalize(cluster)
                 diarization.append(show=show, cluster=cluster, start=start, stop=stop)
         except Exception as e:
             logging.error(type(e))
             if line is not None:
                 logging.error(line)
     return diarization
Example #2
0
    def read_mdtm(cls, filename, normalize_cluster=False, encoding="utf8"):
        """
        Read a MDTM file

        Each data line must hold exactly eight whitespace-separated fields.
        The third and fourth (start and duration, in seconds) are converted
        to integer hundredths of a second; the seventh is stored as the
        segment gender and the eighth as its cluster.

        :param filename: the str input filename
        :param normalize_cluster: normalize the cluster by removing upper case
        and accents
        :param encoding: text encoding used to open the file
        :return: a diarization object
        :raises ValueError: if a data line does not have eight fields or if
        its start/duration is not a number
        """
        # The context manager closes the file even when a malformed line
        # raises, which the previous explicit close() did not.
        with open(filename, 'r', encoding=encoding) as fic:
            diarization = Diar()
            if not diarization._attributes.exist('gender'):
                diarization.add_attribut(new_attribut='gender', default='U')
            for line in fic:
                line = line.strip()
                line = re.sub(r'\s+', ' ', line)
                logging.debug(line)
                # Blank lines and comment lines carry no segment.
                if not line or line.startswith(('#', ';;')):
                    continue
                # split line into fields
                show, _tmp, start_str, length, _t, _score, gender, cluster = line.split()
                start = int(round(float(start_str) * 100, 0))
                stop = start + int(round(float(length) * 100, 0))
                if normalize_cluster:
                    cluster = str2str_normalize(cluster)
                diarization.append(show=show, cluster=cluster, start=start, stop=stop, gender=gender)
        return diarization
Example #3
0
 def read_ctm(cls, filename, normalize_cluster=False, encoding="utf8"):
     """
     Read a CTM file

     Each data line must hold exactly five whitespace-separated fields:
     show, an ignored field, start, length and word. The word is used as the
     cluster name of the segment. Start and length must be integer strings;
     they are stored as read, without any unit conversion.
     NOTE(review): CTM files often carry times as decimal seconds, which
     ``int()`` rejects -- confirm the expected unit with the callers.

     Parsing is best-effort: the first malformed line stops the reading, the
     error is logged, and the segments read so far are returned.

     :param filename: the str input filename
     :param normalize_cluster: normalize the cluster by removing upper case
     and accents
     :param encoding: text encoding used to open the file
     :return: a diarization object
     """
     with open(filename, 'r', encoding=encoding) as fic:
         diarization = Diar()
         # Pre-bind so the error handler below can run even when reading the
         # very first line fails (e.g. on a decoding error).
         line = None
         try:
             for line in fic:
                 line = re.sub(r'\s+', ' ', line).strip()
                 # Blank lines and comment lines carry no segment.
                 if not line or line.startswith(('#', ';;')):
                     continue
                 # split line into fields
                 show, _tmp, start, length, word = line.split()
                 if normalize_cluster:
                     word = str2str_normalize(word)
                 begin = int(start)
                 diarization.append(show=show, cluster=word, start=begin, stop=int(length) + begin)
         except Exception as e:
             logging.error(type(e))
             # Report the offending line so the failure can be located.
             if line is not None:
                 logging.error(line)
     return diarization
Example #4
0
 def read_seg(cls, filename, normalize_cluster=False, encoding="utf8"):
     """
     Read a segmentation file

     Each data line must hold exactly eight whitespace-separated fields:
     show, an ignored field, start, length, gender, channel, environment
     and cluster name. Start and length must be integer strings; they are
     stored as read, without any unit conversion.

     Parsing is best-effort: the first malformed line stops the reading, the
     error is logged, and the segments read so far are returned.

     :param filename: the str input filename
     :param normalize_cluster: normalize the cluster speaker by removing upper
     case and accents
     :param encoding: text encoding used to open the file
     :return: a diarization object
     """
     with open(filename, 'r', encoding=encoding) as fic:
         diarization = Diar()
         # Make sure the three per-segment attributes exist, 'U' by default.
         for attribut in ('gender', 'env', 'channel'):
             if not diarization._attributes.exist(attribut):
                 diarization.add_attribut(new_attribut=attribut, default='U')
         # Pre-bind so the error handler below can run even when reading the
         # very first line fails (e.g. on a decoding error).
         line = None
         try:
             for line in fic:
                 line = re.sub(r'\s+', ' ', line).strip()
                 # Blank lines and comment lines carry no segment.
                 if not line or line.startswith(('#', ';;')):
                     continue
                 # split line into fields
                 show, _tmp, start, length, gender, channel, environment, name = line.split()
                 if normalize_cluster:
                     name = str2str_normalize(name)
                 begin = int(start)
                 diarization.append(
                     show=show,
                     cluster=name,
                     start=begin,
                     stop=int(length) + begin,
                     env=environment,
                     channel=channel,
                     gender=gender,
                 )
         except Exception as e:
             logging.error(type(e))
             # Report the offending line so the failure can be located.
             if line is not None:
                 logging.error(line)
     return diarization
Example #5
0
 def read_stm(cls, filename, normalize_cluster=False, encoding="ISO-8859-1"):
     """
     Read a STM file

     Fields used from each whitespace-separated data line: the 1st is the
     show, the 3rd the cluster, the 4th and 5th the start and stop times in
     seconds (converted to integer hundredths of a second), and the 6th a
     ``<...>`` label whose third comma-separated item gives the gender.
     ``female`` and ``male`` are stored as ``F`` and ``M``; any other value
     leaves the gender attribute at its default.

     Parsing is best-effort: the first malformed line stops the reading, the
     error is logged, and the segments read so far are returned.

     :param filename: the str input filename
     :param normalize_cluster: normalize the cluster by removing upper case
     and accents
     :param encoding: text encoding used to open the file
     :return: a diarization object
     """
     gender_codes = {"female": "F", "male": "M"}
     with open(filename, 'r', encoding=encoding) as fic:
         diarization = Diar()
         if not diarization._attributes.exist('gender'):
             diarization.add_attribut(new_attribut='gender', default='U')
         # Pre-bind so the error handler below can run even when reading the
         # very first line fails (e.g. on a decoding error).
         line = None
         try:
             for line in fic:
                 line = re.sub(r'\s+', ' ', line).strip()
                 # Blank lines and comment lines carry no segment.
                 if not line or line.startswith(('#', ';;')):
                     continue
                 # split line into fields
                 fields = line.split()
                 show = fields[0]
                 loc = fields[2]
                 if normalize_cluster:
                     loc = str2str_normalize(loc)
                 # Round rather than truncate: float("0.29") * 100 is
                 # 28.999..., which int() alone would turn into 28.
                 start = int(round(float(fields[3]) * 100))
                 stop = int(round(float(fields[4]) * 100))
                 # Label looks like "<a,b,gender>": drop the brackets and
                 # split on the commas.
                 label = fields[5].replace(">", "").replace("<", "").replace(",", " ")
                 gender = gender_codes.get(label.split()[2])
                 if gender is None:
                     diarization.append(show=show, cluster=loc, start=start, stop=stop)
                 else:
                     diarization.append(show=show, cluster=loc, start=start, stop=stop, gender=gender)
         except Exception as e:
             logging.error(type(e))
             if line is not None:
                 logging.error(line)
     return diarization