def get_corpus(self):
        """Build a Corpus from metadata.csv files found under origin_data_path/it_IT.

        Each metadata.csv row is pipe-separated: "<filename>|<source>|<transcript>".
        The wav path is derived from the csv location (sibling "wavs" dir).
        Returns a Corpus with resampling disabled.
        """
        SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []

        wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')

        # Get audiofile path and transcript for each sentence in tsv
        glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
        for record in glob(glob_dir, recursive=True):
            if any(map(lambda sk: sk in record, SKIP_LIST)):
                continue

            ##csv files may come in different encodings - detect before reading
            enc = encoding_from_path(record)
            with open(record, "r", encoding=enc) as rec:
                # NOTE: loop variable renamed from `re`, which shadowed the
                # stdlib `re` module used elsewhere in this file; iterate the
                # file object directly instead of materializing readlines().
                for row in rec:
                    fields = row.strip().split("|")
                    audio = os.path.join(os.path.dirname(record), "wavs",
                                         fields[0] + ".wav")
                    transcript = fields[2]
                    ##append data manifest
                    utterances[audio] = transcript
                    audios.append(audio)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## clips are WAV 16000Hz - 1 chnl, no resample required
        ## NOTE(review): original comment said "evalita2009" - presumably a
        ## copy-paste leftover; confirm against the actual dataset
        corpus.make_wav_resample = False
        return corpus
## Example #2 (extraction artifact separating the pasted examples)
    def get_corpus(self):
        """Build a Corpus from the EVALITA development/training transcript files.

        Each line of <archive_name>/<split>.txt is "<rel/path>.wav<text>".
        Transcripts are mapped char-by-char through EVALITA_DICT_CONVERSION
        (number-to-word conversion) and whitespace is collapsed; audio paths
        are made absolute with OS-native separators.
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []

        for d in ("development", "training"):
            # NOTE: removed an unused `wav_dir` local that was computed here
            transcript_path = os.path.join(self.origin_data_path,
                                           self.archive_name, d + ".txt")

            with open(transcript_path) as fin:
                for line in fin:
                    t_s = line.split(".wav", maxsplit=1)
                    audio_file_rel_path = t_s[0] + '.wav'
                    transcript = t_s[1].strip()

                    ###preprocess transcript - replace numbers from numeric to literal
                    # NOTE(review): .get(c, '') drops any character missing
                    # from EVALITA_DICT_CONVERSION - verify the dict covers
                    # the full expected alphabet
                    transcript = ' '.join(
                        [EVALITA_DICT_CONVERSION.get(c, '') for c in transcript])
                    transcript = re.sub(r'\s+', ' ', transcript)
                    ## need absolute path for audios, separator cross os
                    _audio_file_rel_path = audio_file_rel_path.replace(
                        '/', os.path.sep)
                    audio_file_abs_path = os.path.join(self.origin_data_path,
                                                       _audio_file_rel_path)
                    ##append data manifest
                    utterances[audio_file_abs_path] = transcript
                    audios.append(audio_file_abs_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## evalita2009 have clips WAV 16000Hz - 1 chnl
        ## not require resample
        corpus.make_wav_resample = False
        return corpus
## Example #3 (extraction artifact separating the pasted examples)
    def get_corpus(self):
        """Build a Corpus by walking the SIWIS wav/IT tree.

        For every wav file, the transcript with the same basename is read
        from the parallel txt/IT tree, with per-file encoding detection
        (files ship in utf-8, utf_16_be, etc.).
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav",
                               "IT")
        text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt",
                                "IT")

        for subdir, dirs, files in os.walk(wav_dir):
            for _dir in dirs:
                curr_wav_dir = os.path.join(subdir, _dir)
                curr_txt_dir = os.path.join(text_dir, _dir)

                ##iterate wav file current folder
                for fname in os.listdir(curr_wav_dir):
                    fname = os.fsdecode(fname)

                    # join against the directory we actually listed; the old
                    # os.path.join(wav_dir, _dir, ...) breaks for nested
                    # walk levels where subdir != wav_dir
                    wav_file_path = os.path.join(curr_wav_dir, fname)
                    txt_file_path = os.path.join(curr_txt_dir,
                                                 fname.split('.')[0] + '.txt')
                    if not os.path.isfile(txt_file_path):
                        # BUG FIX: the {} placeholder was never filled in
                        print('audio file {} doesn\'t have a file transcript'
                              .format(wav_file_path))
                        continue

                    ##files have different encoding (utf-8, utf_16_be, etc..)
                    ##need check to open file with correct encoding
                    enc = CnM.from_path(txt_file_path).best().first()
                    file_encoding = enc.encoding
                    ##fix same encoding
                    if file_encoding in ('big5', 'cp1252'):
                        file_encoding = 'utf-8'

                    with open(txt_file_path, "r", encoding=file_encoding) as f:
                        transcript = f.readline()

                    transcript = transcript.strip()
                    ##append data manifest
                    utterances[wav_file_path] = transcript
                    audios.append(wav_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## SIWIS clips need resample wav - clips is 44100Hz  706 kb/s (1 chnl)
        corpus.make_wav_resample = True
        return corpus
    def get_corpus(self):
        """Build a Corpus from M-AILABS-style metadata.csv files.

        Each row is "<filename>|<source transcript>|<normalized transcript>".
        Known bad examples are filtered out and the wrong apostrophe
        normalization in the provided transcripts is repaired via
        fix_apostrophe().
        """
        SKIP_LIST = []  ## filter(None, CLI_ARGS.skiplist.split(","))
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []

        fixed_token = {}
        wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')

        bad_examples = self.get_bad_examples()

        # Get audiofile path and transcript for each sentence in tsv
        glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
        for record in glob(glob_dir, recursive=True):
            if any(map(lambda sk: sk in record, SKIP_LIST)):
                continue

            enc = encoding_from_path(record)
            with open(record, "r", encoding=enc) as rec:
                # NOTE: loop variable renamed from `re`, which shadowed the
                # stdlib `re` module used elsewhere in this file
                for row in rec:
                    fields = row.strip().split("|")

                    filename = fields[0]
                    ##filter bad examples (https://github.com/MozillaItalia/DeepSpeech-Italian-Model/issues/124#issuecomment-798613031)
                    if filename in bad_examples:
                        continue
                    audio = os.path.join(os.path.dirname(record), "wavs",
                                         filename + ".wav")
                    transcript_source = fields[1]
                    transcript = fields[2]
                    ##in MLS normalization of character '’'  is  wrong in transcription normalization
                    transcript = fix_apostrophe(transcript_source, transcript,
                                                fixed_token)

                    ##append data manifest
                    utterances[audio] = transcript
                    audios.append(audio)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## clips require resample
        ## NOTE(review): original comment said "evalita2009 ... not require
        ## resample" while setting True - presumably a copy-paste leftover
        corpus.make_wav_resample = True

        ##self.save_wrong_token_dictionary(fixed_token)

        return corpus
    def get_corpus(self):
        """Build a Corpus from MLS-style train/dev/test transcript files.

        Each line of <split>/transcripts.txt is "<spk>_<book>_<utt>\\t<text>";
        the matching flac lives under <split>/audio/<spk>/<book>/.
        Returns a Corpus that requires resampling (.flac input).
        """
        utterances = {}
        audios = []
        processed = 0

        for split in ("train", "dev", "test"):
            audio_root = os.path.join(self.origin_data_path,
                                      self.archive_name, split, "audio")
            transcript_file = os.path.join(self.origin_data_path,
                                           self.archive_name, split,
                                           "transcripts.txt")

            with open(transcript_file, encoding="utf-8") as handle:
                for row in handle:
                    fields = re.split(r'\t+', row)

                    key = fields[0]
                    key_parts = key.split('_')
                    ## audio path layout: audio/<speaker>/<book>/<key>.flac
                    flac_path = os.path.join(audio_root,
                                             key_parts[0],
                                             key_parts[1],
                                             key + ".flac")
                    ##append data manifest
                    utterances[flac_path] = fields[1].strip()
                    audios.append(flac_path)
                    processed += 1

        ##collect corpus
        corpus = Corpus(utterances, audios)
        ## audio .flac require resample
        corpus.make_wav_resample = True
        return corpus
    def get_corpus(self):
        """Build a Corpus from MLS-style train/dev/test transcript files.

        Each line of <split>/transcripts.txt is "<spk>_<book>_<utt>\\t<text>";
        the matching flac lives under <split>/audio/<spk>/<book>/.
        Returns a Corpus that requires resampling (.flac input).
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        for d in ("train", "dev", "test"):
            wav_dir = os.path.join(self.origin_data_path, self.archive_name, d,
                                   "audio")
            transcript_path = os.path.join(self.origin_data_path,
                                           self.archive_name, d,
                                           "transcripts.txt")

            # explicit utf-8: the near-identical sibling importer passes
            # encoding="utf-8"; relying on the platform default encoding
            # here could mis-decode transcripts on non-utf-8 systems
            with open(transcript_path, encoding="utf-8") as fin:
                for line in fin:
                    t_s = re.split(r'\t+', line)

                    flac_file_path_t = t_s[0].split('_')
                    file_name = t_s[0] + ".flac"
                    audio_file_path = os.path.join(wav_dir,
                                                   flac_file_path_t[0],
                                                   flac_file_path_t[1],
                                                   file_name)
                    transcript = t_s[1].strip()
                    ##append data manifest
                    utterances[audio_file_path] = transcript
                    audios.append(audio_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        ##  audio .flac require resample
        corpus.make_wav_resample = True
        return corpus
## Example #7 (extraction artifact separating the pasted examples)
    def get_corpus(self):
        """Build a Corpus from a VoxForge-style archive.

        Each etc/PROMPTS line is "<speaker>/.../<file_id> <transcript>"; the
        file id is matched (by substring, as in the original lookup) against
        the files of the flat wav directory.
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.archive_name, "wav")
        text_file = os.path.join(self.origin_data_path, self.archive_name,
                                 "etc", "PROMPTS")

        wav_files = [f for f in os.listdir(wav_dir)
                     if os.path.isfile(os.path.join(wav_dir, f))]

        with open(text_file, encoding='utf-8') as f:
            for line in f:
                parts = line.split(" ", 1)
                ref_url = parts[0]
                transcript = parts[1].lower().replace('\n', '')

                ## last component of "<speaker>/.../<id>" is the file id
                ## (removed unused speaker_id / count locals)
                file_n = ref_url.split('/')[-1]
                # first wav file whose name contains the id
                match = next((w for w in wav_files if file_n in w), None)
                if match is not None:
                    wav_file_path = os.path.join(wav_dir, match)
                    utterances[wav_file_path] = transcript
                    audios.append(wav_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## VoxForge need wav resample
        corpus.make_wav_resample = True
        return corpus
    def get_corpus(self):
        """Build a Corpus from the MSPKA wav_1.0.0 / lab_1.0.0 layout.

        Every wav has a .lab annotation file; each annotation row with text
        (columns 4+) that is not marked SILENCE_ANNOTATION contributes to the
        transcript, after accented-character escape cleanup.
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.archive_name,
                               "wav_1.0.0")
        text_dir = os.path.join(self.origin_data_path, self.archive_name,
                                "lab_1.0.0")

        ##iterate wav file current folder
        for fname in os.listdir(wav_dir):

            fname = os.fsdecode(fname)
            if not fname.lower().endswith('.wav'):
                continue

            wav_file_path = os.path.join(wav_dir, fname)
            txt_file_path = os.path.join(text_dir,
                                         fname.split('.')[0] + '.lab')
            if not os.path.isfile(txt_file_path):
                print('audio file {} doesn\'t have a file transcript'.format(
                    str(wav_file_path)))
                continue

            ##read file transcript
            # BUG FIX: file_encoding was assigned 'utf-8' but never passed to
            # open(), so the platform default encoding was used instead
            with open(txt_file_path, encoding='utf-8') as f:
                transcript_annotaded = f.readlines()

            ##parse annotation - build a clean transcript
            transcript_toks = []
            for line in transcript_annotaded:

                row_data = line.split()
                if len(row_data) <= 3:
                    ##no transcript here
                    continue

                annotation = row_data[2]
                if annotation == SILENCE_ANNOTATION:
                    continue
                curr_text = ' '.join(row_data[3:])
                ##clear text -   accented char escape
                curr_text = string_escape(curr_text)

                transcript_toks.append(curr_text)

            transcript = ' '.join(transcript_toks).strip()
            ##append data manifest
            utterances[wav_file_path] = transcript
            audios.append(wav_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## MSPKA need wav resample - clips is 22050Hz  353 kb/s (1 chnl)
        corpus.make_wav_resample = True
        return corpus
    def get_corpus(self):
        """Build a Corpus for SIWIS from wav/IT, txt/IT and the prompts file.

        Transcripts come primarily from ALL_IT_prompts_iso.txt (cp1252,
        tab-separated "<name>.txt\\t<text>"); files missing from the prompts
        fall back to the parallel txt/IT tree, and a missing fallback file is
        a hard error (ValueError).
        """
        ##extract training and development datasets
        ##do data merge, ArchiveImporter make final train/test/dev datasets
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav",
                               "IT")
        text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt",
                                "IT")
        ##read transcript in prompts.txt
        transcripts = {}
        ##encoding prompts files is cp1252
        encoding = 'cp1252'
        ###read transcript from prompts file
        prompts_path = os.path.join(self.origin_data_path, self.extract_dir,
                                    "prompts", "ALL_IT_prompts_iso.txt")
        with open(prompts_path, "r", encoding=encoding) as f:
            # iterate the file directly instead of a readline() while-loop
            for line in f:
                temp = re.split(r'\t', line)
                transcripts[temp[0]] = temp[1].strip()

        for subdir, dirs, files in os.walk(wav_dir):
            for _dir in dirs:

                if _dir == 'converted':
                    ##wav converted in a previous session run
                    continue

                curr_wav_dir = os.path.join(subdir, _dir)
                ##iterate wav file current folder
                for fname in os.listdir(curr_wav_dir):
                    fname = os.fsdecode(fname)
                    if fname == 'converted':
                        ##skip wav converted by importer
                        continue
                    # NOTE(review): joins wav_dir (not curr_wav_dir); these
                    # agree only on the top walk level - confirm intended
                    wav_file_path = os.path.join(wav_dir, _dir, fname)

                    try:
                        transcript = transcripts[fname.replace('.wav', '.txt')]
                    except KeyError:  # narrowed from a bare except
                        curr_txt_dir = os.path.join(text_dir, _dir)
                        txt_file_path = os.path.join(
                            curr_txt_dir, fname.split('.')[0] + '.txt')
                        #print('missing prompts , read transcript from file {}'.format(txt_file_path))
                        if not os.path.isfile(txt_file_path):
                            raise ValueError(
                                'audio file {} doesn\'t have a file transcript'
                                .format(wav_file_path))
                        transcript = self.read_txt_file(txt_file_path)

                    ##append data manifest
                    utterances[wav_file_path] = transcript
                    audios.append(wav_file_path)

        ##collect corpus
        corpus = Corpus(utterances, audios)
        #################
        ## SIWIS clips need resample wav - clips is 44100Hz  706 kb/s (1 chnl)
        corpus.make_wav_resample = True
        return corpus