# Assumed context for this snippet: the cleaner functions (basic_cleaners,
# transliteration_cleaners, flowtron_cleaners, english_cleaners) and the
# project helpers DataReader, DataWriter and Method are imported at module
# level, and directory_of_results is defined at module scope.
import csv
import os

from tqdm import tqdm


def main(args):
    '''
    Test a text normalizer against the Python-based Flowtron cleaners
    '''

    TEXT_NORMALIZATION_CLEANERS_FUNCTION_MAPPINGS = {
        'basic_cleaners': basic_cleaners,
        'transliteration_cleaners': transliteration_cleaners,
        'flowtron_cleaners': flowtron_cleaners,
        'english_cleaners': english_cleaners
    }

    path_test = args.path_test
    module_normalizer = args.module_normalizer
    function_mappings = TEXT_NORMALIZATION_CLEANERS_FUNCTION_MAPPINGS
    normalizer = Method().call_func(function_mappings, module_normalizer)
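    # A minimal sketch of the dispatch this relies on, assuming
    # Method.call_func is a plain dictionary lookup (an assumption --
    # the helper is not shown in this snippet):
    #
    #     def call_func(self, function_mappings, name):
    #         return function_mappings[name]
    #
    # e.g. call_func(function_mappings, 'english_cleaners') returns the
    # english_cleaners function itself.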
    '''
    Get unit test from path_test and get failed test
    '''
    obj = {'header': None, 'na_filter': False, 'quoting': csv.QUOTE_NONE}
    unit_test = DataReader(path_file=path_test, filetype="sv",
                           separator="\t").read_data_file(**obj)
    sentences_normalized = unit_test[unit_test.columns[0]].apply(normalizer)
    sentences_target = unit_test[unit_test.columns[1]]
    results_test = ["Original\tNormalized\tExpected Normalization"]
    for index, sentence_normalized in enumerate(tqdm(sentences_normalized)):
        if str(sentence_normalized) != str(sentences_target[index]):
            original = str(unit_test[unit_test.columns[0]][index])
            results_test.append(original + "\t" + str(sentence_normalized) +
                                "\t" + str(sentences_target[index]))
            print("FAIL TEST: " + original + " - (Original)" + " ||| " +
                  str(sentence_normalized) + " - (Normalized)" + " ||| " +
                  str(sentences_target[index]) + " (Expected Normalization)")

    nb_failed_tests = len(results_test) - 1
    if nb_failed_tests == 0:
        print("ALL TEST(S) PASSED!")
    else:
        print(
            str(nb_failed_tests) + "/" + str(unit_test.shape[0]) +
            " TEST(S) FAILED!")
    '''
    Write results test
    '''
    test_filename = Method().get_filename(path_test)
    filename_results = "results" + "_" + test_filename + ".txt"
    path_results = os.path.join(directory_of_results, filename_results)
    DataWriter(results_test, path_results).write_data_file()
Example No. 2
def main(args, project_name):
    '''
    Prepare a language model using KenLM
    '''

    DATA_FOLDER_NAME = "DATA"

    directory_of_script = os.path.dirname(os.path.realpath(__file__))
    directory_of_results = os.path.join(directory_of_script, "results",
                                        project_name)
    os.makedirs(directory_of_results, exist_ok=True)

    language = args["language"].lower()
    clean_data = args["clean_data"]
    data_directory = args["data_directory"]
    path_list_training_data = args["path_list_training_data"]
    path_validation_data = args["path_validation_data"]
    path_list_training_data_cleaned = args["path_list_training_data_cleaned"]
    path_language_model = args["path_language_model"]
    perplexity = args["perplexity"]

    dir_text = os.path.join(directory_of_results, language)
    os.makedirs(dir_text, exist_ok=True)
    if data_directory is None:
        data_directory = os.path.join(directory_of_script, DATA_FOLDER_NAME,
                                      project_name)
        os.makedirs(data_directory, exist_ok=True)

    list_path_training_text = DataReader(
        path_list_training_data).read_data_file(keep_line_break=False)
    if path_list_training_data_cleaned is None:
        path_list_training_data_cleaned = os.path.join(
            directory_of_results, "list_training_data_cleaned.txt")
        list_path_training_text_cleaned = []
    else:
        list_path_training_text_cleaned = DataReader(
            path_list_training_data_cleaned).read_data_file(
                keep_line_break=False)

    if clean_data:
        print("Cleaning data...")
        for path_text in tqdm(list_path_training_text):
            text = DataReader(path_text).read_data_file(keep_line_break=False)
            if len(text) > 0:
                #Fix number of max parallelized process
                nb_max_parallelized_process = min(len(text), os.cpu_count())
                if language == "ar":
                    print("ArbTextProcessor...")
                    from modules.preprocessing.text.cleaning.arb_text_processor import ArbTextProcessor
                    with Pool(processes=nb_max_parallelized_process) as pool:
                        text = pool.map(ArbTextProcessor().clean,
                                        tqdm(set(text)))
                    # list_arg = [(line, 4, 'english') for line in text]
                print("Process_line...")
                with Pool(processes=nb_max_parallelized_process) as pool:
                    text = pool.map(process_line, tqdm(set(text)))
                text = [
                    element[0] for element in tqdm(text) if len(element) > 0
                ]

                original_filename = Method().get_filename(path_text)
                cleaned_filename = original_filename + "_" + "cleaned" + "_" + language + ".txt"
                path_filename_cleaned = os.path.join(dir_text,
                                                     cleaned_filename)
                list_path_training_text_cleaned.append(path_filename_cleaned)
                DataWriter(text, path_filename_cleaned).write_data_file()
        DataWriter(set(list_path_training_text_cleaned),
                   path_list_training_data_cleaned).write_data_file()

    print("Concatening cleaned data...")
    total_filename = "total_cleaned_text.txt"
    total_filename_rm_duplicated = "total_cleaned_text_rm_duplicated.txt"
    path_total_filename = os.path.join(dir_text, total_filename)
    path_total_filename_rm_duplicated = os.path.join(
        dir_text, total_filename_rm_duplicated)
    os.system("rm -rf " + path_total_filename)
    for path_text_cleaned in tqdm(list_path_training_text_cleaned):
        os.system("cat " + path_text_cleaned + " >> " + path_total_filename)
    print("Removing duplicated line...")
    os.system("sort " + path_total_filename + " | uniq -u > " +
              path_total_filename_rm_duplicated)
    os.system("rm -rf " + path_total_filename)

    if perplexity:
        print("Compute LM perplexity of validation data...")
        validation_text = DataReader(path_validation_data).read_data_file(
            keep_line_break=False)
        validation_filename = Method().get_filename(path_validation_data)
        perplexity_filename = validation_filename + "_" + "perplexity" + ".txt"
        path_perplexity_filename = os.path.join(dir_text, perplexity_filename)
        model = kenlm.Model(path_language_model)
        # Third column: geometric mean of the per-token probabilities, i.e.
        # the inverse of the sentence perplexity. Summing the log10 scores
        # and exponentiating once avoids the numerical underflow that
        # multiplying many small probabilities (np.prod) would cause.
        perplexity_data = []
        for s in validation_text:
            scores = [score for score, _, _ in model.full_scores(s)]
            geometric_mean_prob = math.pow(10.0, sum(scores) / len(scores))
            perplexity_data.append(s + "\t" + str(model.perplexity(s)) +
                                   "\t" + str(geometric_mean_prob))
        DataWriter(perplexity_data, path_perplexity_filename).write_data_file()
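        # Relation between the two scores above (a sketch, using KenLM's
        # log10-based API): for a sentence s with N scored tokens,
        #
        #     log10_total = sum(score for score, _, _ in model.full_scores(s))
        #     perplexity  = 10.0 ** (-log10_total / N)  # == model.perplexity(s)
        #     geo_mean    = 10.0 ** (log10_total / N)   # == 1 / perplexity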


if __name__ == "__main__":

    PROJECT_NAME = "prepare_language_model"

    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        help='JSON file for configuration')
    parser.add_argument('-p', '--params', nargs='+', default=[])
    args = parser.parse_args()
    args.rank = 0

    with open(args.config) as f:
        data = f.read()

    args_config = json.loads(data)[PROJECT_NAME]
    args_config = Method().update_params(args_config, args.params)

    main(args=args_config, project_name=PROJECT_NAME)

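# Standalone fragment: assumes dir_file, dir_audio_data_files,
# dir_text_data_files, AUDIO_FORMAT, AudioPreprocessor, and the parallel
# lists list_audio, list_youtube_code, list_time and list_data are defined
# earlier in the original script (not shown here).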
print("Trimming audio...")
for index, sub_directory in enumerate(list_audio):
    for filename in tqdm(list(dict.fromkeys(list_youtube_code[index]))):
        list_good_index = [
            pos for pos, element in enumerate(list_youtube_code[index])
            if element == filename
        ]
        path_audio = os.path.join(dir_file, sub_directory,
                                  filename + "." + AUDIO_FORMAT)
        path_audio_output = os.path.join(dir_audio_data_files,
                                         filename + "." + AUDIO_FORMAT)
        list_trimmed_audio_path = AudioPreprocessor().trim_audio_wav(
            path_input=path_audio,
            path_output=path_audio_output,
            list_time=[list_time[index][i] for i in list_good_index])

        for index_trim, trimmed_audio_path in enumerate(
                list_trimmed_audio_path):
            DataWriter([[list_data[index][i]
                         for i in list_good_index][index_trim]],
                       os.path.join(
                           dir_text_data_files,
                           Method().get_filename(trimmed_audio_path) +
                           ".txt")).write_data_file()
Example No. 5
def main(args, project_name):
    '''
    Prepare youtube data for Tacotron2 and Flowtron
    
    ________________________________________________________________________________________________________________
    The Tacotron 2 and WaveGlow models form a text-to-speech system that
    enables users to synthesize natural sounding speech from raw
    transcripts without any additional prosody information.
    The Tacotron 2 model produces mel spectrograms from input 
    text using encoder-decoder architecture. WaveGlow (also available via torch.hub) 
    is a flow-based model that consumes the mel spectrograms to generate speech.
    This implementation of Tacotron 2 model differs from the model described in the paper. 
    Our implementation uses Dropout instead of Zoneout to regularize the LSTM layers.
    
    ________________________________________________________________________________________________________________ 
    Flowtron: an Autoregressive Flow-based Network for Text-to-Mel-spectrogram Synthesis
    Rafael Valle, Kevin Shih, Ryan Prenger and Bryan Catanzaro

    In our recent paper we propose Flowtron: an autoregressive flow-based generative network for text-to-speech 
    synthesis with control over speech variation and style transfer. Flowtron borrows insights from Autoregressive 
    Flows and revamps Tacotron in order to provide high-quality and expressive mel-spectrogram synthesis. 
    Flowtron is optimized by maximizing the likelihood of the training data, which makes training simple and stable. 
    Flowtron learns an invertible mapping of data to a latent space that can be manipulated to control many aspects 
    of speech synthesis (pitch, tone, speech rate, cadence, accent).

    Our mean opinion scores (MOS) show that Flowtron matches state-of-the-art TTS models in terms of speech quality. 
    In addition, we provide results on control of speech variation, interpolation between samples and style transfer 
    between speakers seen and unseen during training.
    '''
    
    USER_CLUSTER = 'ks1'
    DIR_CLUSTER = os.path.join('/home',USER_CLUSTER)
    SEED = 42
    AUDIO_FORMAT = 'wav' #Required audio format for taflowtron
    DATA_FOLDER_NAME = "DATA"
    
    directory_of_script = os.path.dirname(os.path.realpath(__file__))
    directory_of_results = os.path.join(directory_of_script,"results",PROJECT_NAME)
    directory_of_data = os.path.join(directory_of_script,DATA_FOLDER_NAME,PROJECT_NAME)
    os.makedirs(directory_of_results,exist_ok=True)
    os.makedirs(directory_of_data,exist_ok=True)
    
    name_train_param_config = args["name_train_param_config"]
    name_data_config = args["name_data_config"]
    language = args["language"].lower()
    data_directory = args["data_directory"]
    directory_taflowtron_filelist = args["directory_taflowtron_filelist"]
    path_hparam_file = args["path_hparam_file"]
    path_symbols_file = args["path_symbols_file"]
    path_list_url = args["path_list_url"]
    path_youtube_cleaner = args["path_youtube_cleaner"]
    converter = args["converter"]
    silence = args["silence"]
    silence_threshold = args["silence_threshold"]
    remove_noise = args["remove_noise"]
    audio_normalization = args["audio_normalization"]
    generated_subtitle = args["generated_subtitle"]
    concatenate_vtt = args["concatenate_vtt"]
    max_limit_duration = args["max_limit_duration"]
    min_limit_duration = args["min_limit_duration"]
    tts_model = args["tts_model"]
    warmstart_model = args["warmstart_model"]
    batch_size = args["batch_size"]
    nb_speaker = args["nb_speaker"]
    
    if data_directory is None: data_directory = directory_of_data
    
    dir_tts_model = os.path.join('models','tts',tts_model)
    dir_cluster_data = os.path.join(DIR_CLUSTER,DATA_FOLDER_NAME,PROJECT_NAME)
    data_information = pd.DataFrame()
    data_filelist = []
    ITN_symbols = []
    voice_id = 0
    total_set_audio_length = 0
    source = Method().get_filename(path_list_url)
    
    '''
    Get audio and subtitle from youtube url
    '''
    list_url = DataReader(path_list_url).read_data_file()
    list_url = [line.rstrip('\n') for line in list_url]
    
    obj = {'header':None, 'na_filter':False, 'quoting':csv.QUOTE_NONE}
    cleaner_youtube = DataReader(path_youtube_cleaner).read_data_file(**obj)
    
    for url in tqdm(list_url):
        
        '''
        Download audio and subtitle from youtube using youtube-dl
        '''
        dir_original_youtube_data = os.path.join(data_directory,language,'original')
        os.makedirs(dir_original_youtube_data,exist_ok=True)
        path_subtitle, path_audio = MediaScraper().get_audio_youtube_data(url=url, 
                                                                          audio_format=AUDIO_FORMAT, 
                                                                          subtitle_language=language, 
                                                                          directory_output=dir_original_youtube_data,
                                                                          generated_subtitle=generated_subtitle)
        if re.search(r'NO_(MANUAL|GENERATED)_SUBTITLE\.vtt',path_subtitle) is not None:
            continue
        base = os.path.basename(path_audio)
        youtube_code = os.path.splitext(base)[0]
        '''
        Parse subtitles to get trim and text information
        '''
        print("Extracting information from vtt files...")
        data_subtitle = DataReader(path_subtitle).read_data_file()
        #data_subtitle = TextScraper().get_youtube_subtitle(youtube_id=youtube_code, generated_mode=generated_subtitle, language_code=[language])
        list_time, list_subtitle = DataPreprocessor().get_info_from_vtt(data=data_subtitle,
                                                                        cleaner=cleaner_youtube,
                                                                        concatenate=concatenate_vtt,
                                                                        max_limit_duration=max_limit_duration, 
                                                                        min_limit_duration=min_limit_duration,
                                                                        use_youtube_transcript_api=False)
        
        '''
        Trim audio regarding vtt information
        '''
        print("Trimming audio...")
        dir_audio_data_files = os.path.join(data_directory,language,source,name_data_config,youtube_code,'clips')
        os.makedirs(dir_audio_data_files,exist_ok=True)
        path_audio_output = os.path.join(dir_audio_data_files,youtube_code) + "." + AUDIO_FORMAT
        list_trimmed_audio_path = AudioPreprocessor().trim_audio_wav(path_input=path_audio,
                                                                     path_output=path_audio_output,
                                                                     list_time=list_time)
        
        #Fix number of max parallelized process
        nb_max_parallelized_process = min(len(list_trimmed_audio_path), os.cpu_count())
            
        '''
        Remove Noise
        '''
        if remove_noise:
            print("Revoming noise...")
            list_arg = [(audio_path, audio_path) for audio_path in list_trimmed_audio_path]
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().reduce_audio_noise, tqdm(list_arg))
            
        '''
        Normalize audio (Boosting quiet audio)
        '''
        if audio_normalization:
            print("Audio Normalization...")
            list_arg = [(audio_path, audio_path) for audio_path in list_trimmed_audio_path]
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().normalize_audio, tqdm(list_arg))           
            
        '''
        Add and/or Remove leading and trailing silence and/or convert audio
        '''
        if silence == "remove":
            print("Revoming leading/middle/trailing silence and convert audio...")
            dir_audio_data_files_trimmed = os.path.join(data_directory,language,source,name_data_config,youtube_code,'_temp_clips_trimmed')
            os.makedirs(dir_audio_data_files_trimmed,exist_ok=True)
            list_arg = [(audio_path, 
                         os.path.join(dir_audio_data_files_trimmed,Method().get_filename(audio_path) + "." + AUDIO_FORMAT),
                         True) for audio_path in list_trimmed_audio_path]
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().trim_silence, tqdm(list_arg))
            shutil.rmtree(dir_audio_data_files_trimmed)
            
            list_arg = [(audio_path, audio_path) for audio_path in list_trimmed_audio_path]
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().trim_lead_trail_silence, tqdm(list_arg))
        
        if silence == "add":
            print("Padding silence...")
            list_arg = [(audio_path,audio_path,silence_threshold,True,True) for audio_path in tqdm(list_trimmed_audio_path)]
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().add_lead_trail_audio_wav_silence, tqdm(list_arg))
                
        '''
        Convert audio data for taflowtron model
        '''
        if converter:
            print("Audio conversion...")
            dir_audio_data_files_converted = os.path.join(data_directory,language,source,name_data_config,youtube_code,'_temp_clips_converted')
            os.makedirs(dir_audio_data_files_converted,exist_ok=True)
            list_arg = [(audio_path,
                        os.path.join(dir_audio_data_files_converted,Method().get_filename(audio_path) + "." + AUDIO_FORMAT),
                        22050,
                        1,
                        16,
                        True) for audio_path in list_trimmed_audio_path]
            
            with Pool(processes=nb_max_parallelized_process) as pool:
                pool.starmap(AudioPreprocessor().convert_audio, tqdm(list_arg))    
            shutil.rmtree(dir_audio_data_files_converted)
        
        '''
        Get ITN symbols from subtitles
        '''
        print("Getting ITN symbols from data...")
        ITN_symbols += DataPreprocessor().get_ITN_data(data_text=list_subtitle, data_option=list_trimmed_audio_path)
        
        '''
        Update audio path for cluster
        '''
        list_trimmed_audio_path = [audio_path.replace(data_directory,dir_cluster_data) for audio_path in list_trimmed_audio_path]
    
        '''
        Create taflowtron filelist and data information
        '''
        list_duration = [(time[1]-time[0])/1000 for time in list_time]
        list_average_duration = [sum(list_duration)/len(list_duration)]*len(list_duration)
        list_total_video_extraction = [sum(list_duration)/3600]*len(list_duration)
        total_set_audio_length += sum(list_duration)
        mem_info = pd.DataFrame({"Audio Path":list_trimmed_audio_path,
                                 "Text":list_subtitle,
                                 "Speaker ID":voice_id,
                                 "Duration (seconds)":list_duration,
                                 "Average Duration (seconds)":list_average_duration,
                                 "Total Video Extraction Duration (hours)":list_total_video_extraction},
                                columns=["Audio Path","Text","Speaker ID", "Duration (seconds)", "Average Duration (seconds)", "Total Video Extraction Duration (hours)"])
        
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent supported call.
        data_information = pd.concat([data_information, mem_info])
        
        data_filelist += [list_trimmed_audio_path[index] + "|" + subtitle + "|" + str(voice_id) for index,subtitle in enumerate(list_subtitle)]
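        # The entries above follow the taflowtron filelist convention
        # "audio_path|transcript|speaker_id", e.g. (illustrative values):
        #     /home/ks1/DATA/.../clips/XYZ_0.wav|Hello there.|0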
        if nb_speaker > 1: voice_id += 1
        
    
    ITN_symbols = set(ITN_symbols)
    data_information["Total Set Extraction Duration (hours)"] = total_set_audio_length/3600
    data_information["Total Set Average Duration (seconds)"] = total_set_audio_length/data_information.shape[0]
        
    '''
    Train, test, validation splitting
    '''
    # First split the data into a training set (80%) and a remaining set.
    X_train, X_valid = train_test_split(data_filelist,train_size=0.8, random_state=SEED)

    # To make the validation and test sets equal in size (10% of the overall
    # data each), the remaining 20% would be split with test_size=0.5, as in
    # the sketch after this block:
    #X_valid, X_test = train_test_split(X_rem, test_size=0.5, random_state=SEED)
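    # A minimal sketch of the full 80/10/10 split described above
    # (X_rem/X_test are hypothetical names; this script currently keeps the
    # plain 80/20 train/valid split):
    #
    #     X_train, X_rem = train_test_split(data_filelist, train_size=0.8,
    #                                       random_state=SEED)
    #     X_valid, X_test = train_test_split(X_rem, test_size=0.5,
    #                                        random_state=SEED)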

    '''
    Write Training, Test, and validation file and ITN symbols file and data information
    '''
    filename_train = "youtube_audio_text_train_filelist_" + language + "_" + name_data_config + "_" + source + ".txt"
    filename_valid = "youtube_audio_text_valid_filelist_" + language + "_" + name_data_config + "_" + source + ".txt"
    #filename_test = "youtube_audio_text_test_filelist_" + language + "_" + name_data_config + "_" + source + ".txt"
    filename_ITN_symbols = "youtube_audio_ITN_symbols_" + language + "_" + name_data_config + "_" + source + ".txt"
    filename_data_information = "youtube_audio_data_information_" + language + "_" + name_data_config + "_" + source + ".tsv"

    new_dir_filelist = os.path.join(directory_taflowtron_filelist,'youtube',language,source,name_data_config)
    dir_ITN = os.path.join(directory_of_results,'itn',language,source,name_data_config)
    dir_information = os.path.join(directory_of_results,'data_summary',language,source,name_data_config)
    os.makedirs(new_dir_filelist,exist_ok=True)
    os.makedirs(dir_ITN,exist_ok=True)
    os.makedirs(dir_information,exist_ok=True)
    
    path_train_filelist = os.path.join(new_dir_filelist,filename_train)
    path_valid_filelist = os.path.join(new_dir_filelist,filename_valid)
    #path_test_filelist = os.path.join(new_dir_filelist,filename_test)
    path_ITN_symbols = os.path.join(dir_ITN,filename_ITN_symbols)
    path_data_information = os.path.join(dir_information,filename_data_information)

    DataWriter(X_train, path_train_filelist).write_data_file()
    DataWriter(X_valid, path_valid_filelist).write_data_file()
    #DataWriter(X_test, path_test_filelist).write_data_file()
    DataWriter(ITN_symbols, path_ITN_symbols).write_data_file()
    DataWriter(data_information, path_data_information, header=True).write_data_file()

    '''
    Update hparams with filelist and batch size
    '''
    if path_hparam_file is not None:
        dir_hparam = os.path.dirname(path_hparam_file)
        new_path_hparam_file = os.path.join(dir_hparam,"config" + "_" + name_train_param_config + ".json")
        path_output_directory = os.path.join(DIR_CLUSTER,dir_tts_model,name_train_param_config,"outdir")
        warmstart_checkpoint_path = os.path.join(DIR_CLUSTER,dir_tts_model,warmstart_model)
        path_cluster_train_filelist = os.path.join(DIR_CLUSTER,'Repositories','AI','modules','tts',tts_model,'filelists','youtube',language,source,name_data_config,filename_train)
        path_cluster_valid_filelist = os.path.join(DIR_CLUSTER,'Repositories','AI','modules','tts',tts_model,'filelists','youtube',language,source,name_data_config,filename_valid)
        
        data_haparams = DataReader(path_hparam_file).read_data_file()
        data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "output_directory": ', value = '"' + path_output_directory + '",\n')
        if batch_size is not None: data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "batch_size": ', value = ' ' + str(batch_size) + ',\n')
        data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "warmstart_checkpoint_path": ', value = '"' + warmstart_checkpoint_path + '",\n')
        data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "training_files": ', value = '"' + path_cluster_train_filelist + '",\n')
        data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "validation_files": ', value = '"' + path_cluster_valid_filelist + '",\n')
        data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "n_speakers": ', value = ' ' + str(voice_id + 1) + ',\n')
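        # A minimal sketch of the same update done via a JSON round-trip
        # instead of line-keyed text edits (assumptions: the hparams file is
        # valid JSON and, as in Flowtron's config layout, "output_directory"
        # lives under "train_config" and the filelists under "data_config"):
        #
        #     with open(path_hparam_file) as f:
        #         config = json.load(f)
        #     config["train_config"]["output_directory"] = path_output_directory
        #     config["data_config"]["training_files"] = path_cluster_train_filelist
        #     config["data_config"]["validation_files"] = path_cluster_valid_filelist
        #     with open(new_path_hparam_file, "w") as f:
        #         json.dump(config, f, indent=4)
        #
        # The line-keyed write_edit_data approach above preserves the file's
        # original layout exactly, which a JSON round-trip would not.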
Example No. 6
import os

from modules.Global.method import Method
from modules.preprocessing.audio import AudioPreprocessor

from multiprocessing import Pool
from tqdm import tqdm

AUDIO_FORMAT = "wav"

dir_audio = "/home/serkhane/Downloads/FR"
list_audio_path = [
    os.path.join(dir_audio, file) for file in os.listdir(dir_audio)
    if file.endswith(".flac")
]

#Fix number of max parallelized process
nb_max_parallelized_process = min(len(list_audio_path), os.cpu_count())

print("Audio conversion...")
dir_audio_data_files_converted = os.path.join(
    "/home/serkhane/converted_mediaspeech", 'wav')
os.makedirs(dir_audio_data_files_converted, exist_ok=True)
list_arg = [
    (audio_path,
     os.path.join(dir_audio_data_files_converted,
                  Method().get_filename(audio_path) + "." + AUDIO_FORMAT),
     16000, 1, 16, False) for audio_path in list_audio_path
]
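# The tuples above are starmap'ed into AudioPreprocessor().convert_audio;
# reading off the values they appear to be (input_path, output_path,
# sample_rate_hz, channels, bit_depth, flag) -- the parameter names are
# assumptions, since convert_audio's signature is not shown in this snippet.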

with Pool(processes=nb_max_parallelized_process) as pool:
    pool.starmap(AudioPreprocessor().convert_audio, tqdm(list_arg))
def main(args, project_name):
    '''
    Prepare Mozilla common voice data for Tacotron2 and Flowtron
    
    ________________________________________________________________________________________________________________
    The Tacotron 2 and WaveGlow models form a text-to-speech system that
    enables users to synthesize natural sounding speech from raw
    transcripts without any additional prosody information.
    The Tacotron 2 model produces mel spectrograms from input 
    text using encoder-decoder architecture. WaveGlow (also available via torch.hub) 
    is a flow-based model that consumes the mel spectrograms to generate speech.
    This implementation of Tacotron 2 model differs from the model described in the paper. 
    Our implementation uses Dropout instead of Zoneout to regularize the LSTM layers.
    
    ________________________________________________________________________________________________________________ 
    Flowtron: an Autoregressive Flow-based Network for Text-to-Mel-spectrogram Synthesis
    Rafael Valle, Kevin Shih, Ryan Prenger and Bryan Catanzaro

    In our recent paper we propose Flowtron: an autoregressive flow-based generative network for text-to-speech 
    synthesis with control over speech variation and style transfer. Flowtron borrows insights from Autoregressive 
    Flows and revamps Tacotron in order to provide high-quality and expressive mel-spectrogram synthesis. 
    Flowtron is optimized by maximizing the likelihood of the training data, which makes training simple and stable. 
    Flowtron learns an invertible mapping of data to a latent space that can be manipulated to control many aspects 
    of speech synthesis (pitch, tone, speech rate, cadence, accent).

    Our mean opinion scores (MOS) show that Flowtron matches state-of-the-art TTS models in terms of speech quality. 
    In addition, we provide results on control of speech variation, interpolation between samples and style transfer 
    between speakers seen and unseen during training.
    '''

    SEED = 42
    USER_CLUSTER = 'ks1'
    DIR_CLUSTER = os.path.join('/home', USER_CLUSTER)
    AUDIO_FORMAT = 'wav'  #Required audio format for taflowtron
    LIST_AUDIO_FILES = ["validated.tsv"]
    USER_COLUMN = "client_id"
    PATH_COLUMN = "path"
    ELEMENT_COLUMN = "sentence"
    OPTION_COLUMN = 'gender'
    DATA_FOLDER_NAME = "DATA"

    directory_of_script = os.path.dirname(os.path.realpath(__file__))
    directory_of_results = os.path.join(directory_of_script, "results",
                                        PROJECT_NAME)
    directory_of_data = os.path.join(directory_of_script, DATA_FOLDER_NAME,
                                     PROJECT_NAME)
    os.makedirs(directory_of_results, exist_ok=True)
    os.makedirs(directory_of_data, exist_ok=True)

    name_train_param_config = args["name_train_param_config"]
    name_data_config = args["name_data_config"]
    language = args["language"].lower()
    gender = args["gender"]
    data_directory = args["data_directory"]
    directory_file_audio_info = args["directory_file_audio_info"]
    path_mcv_cleaner = args["path_mcv_cleaner"]
    directory_taflowtron_filelist = args["directory_taflowtron_filelist"]
    path_hparam_file = args["path_hparam_file"]
    path_symbols_file = args["path_symbols_file"]
    path_speaker_whitelist = args["path_speaker_whitelist"]
    silence = args["silence"]
    silence_threshold = args["silence_threshold"]
    remove_noise = args["remove_noise"]
    audio_normalization = args["audio_normalization"]
    tts_model = args["tts_model"]
    warmstart_model = args["warmstart_model"]
    batch_size = args["batch_size"]
    nb_speaker = args["nb_speaker"]

    if data_directory is None: data_directory = directory_of_data

    dir_tts_model = os.path.join('models', 'tts', tts_model)
    dir_cluster_data = os.path.join(DIR_CLUSTER, DATA_FOLDER_NAME)
    #data_information = pd.DataFrame()
    data_filelist = []
    ITN_symbols = []
    #total_set_audio_length = 0

    dir_audio_data_files = os.path.join(data_directory, language, 'clips')
    dir_audio_data_files_preprocessed = os.path.join(data_directory, language,
                                                     'clips_preprocessed')
    dir_audio_data_files_preprocessing = os.path.join(data_directory, language,
                                                      'clips_preprocessing')
    os.makedirs(dir_audio_data_files_preprocessed, exist_ok=True)
    os.makedirs(dir_audio_data_files_preprocessing, exist_ok=True)
    '''
    Aggregation of test, train and validation data file
    '''
    list_path_audio_files = [
        os.path.join(directory_file_audio_info, language, file)
        for file in LIST_AUDIO_FILES
    ]
    data_info = pd.DataFrame()
    obj = {'na_filter': False, 'quoting': csv.QUOTE_NONE}
    for path_file_audio_info in list_path_audio_files:
        data_read = DataReader(path_file_audio_info).read_data_file(**obj)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent supported call.
        data_info = pd.concat([data_info, data_read], ignore_index=True)
    '''
    Conversion of Mozilla Common Voice audio data information into taflowtron format
    '''
    print("Collect information from MCV data...")
    obj = {'header': None, 'na_filter': False, 'quoting': csv.QUOTE_NONE}
    cleaner_mcv = DataReader(path_mcv_cleaner).read_data_file(**obj)
    if path_speaker_whitelist is not None:
        speaker_whitelist = DataReader(path_speaker_whitelist).read_data_file(
            **obj)
        speaker_whitelist = [
            re.sub('\\n', '', element) for element in speaker_whitelist
        ]
    else:
        speaker_whitelist = None
    list_audio_path, list_subtitle, list_speaker_id, data_info_user, nb_speaker, dir_to_create, list_audio_path_original = DataPreprocessor(
        data_info).convert_data_mcv_to_taflowtron(
            user_column=USER_COLUMN,
            path_column=PATH_COLUMN,
            element_column=ELEMENT_COLUMN,
            data_directory=dir_audio_data_files,
            data_directory_preprocessed=dir_audio_data_files_preprocessed,
            cleaner=cleaner_mcv,
            tts=tts_model,
            option_column=OPTION_COLUMN,
            option_value=gender,
            speaker_whitelist=speaker_whitelist)

    list_audio_path_preprocessing = [
        os.path.join(dir_audio_data_files_preprocessing,
                     Method().get_filename(audio_path) + "." + AUDIO_FORMAT)
        for audio_path in list_audio_path_original
    ]

    #Fix number of max parallelized process
    nb_max_parallelized_process = min(len(list_audio_path_original),
                                      os.cpu_count())
    '''
    Convert audio data for taflowtron model
    '''
    print("Audio conversion...")
    list_arg = [(audio_path, list_audio_path_preprocessing[index], 22050, 1,
                 16)
                for index, audio_path in enumerate(list_audio_path_original)]

    with Pool(processes=nb_max_parallelized_process) as pool:
        pool.starmap(AudioPreprocessor().convert_audio, tqdm(list_arg))
    '''
    Remove Noise
    '''
    if remove_noise:
        print("Revoming noise...")
        list_arg = [(audio_path, audio_path)
                    for audio_path in list_audio_path_preprocessing]
        with Pool(processes=nb_max_parallelized_process) as pool:
            pool.starmap(AudioPreprocessor().reduce_audio_noise,
                         tqdm(list_arg))
    '''
    Normalize audio
    '''
    if audio_normalization:
        print("Audio Normalization...")
        list_arg = [(audio_path, audio_path)
                    for audio_path in list_audio_path_preprocessing]
        with Pool(processes=nb_max_parallelized_process) as pool:
            pool.starmap(AudioPreprocessor().normalize_audio, tqdm(list_arg))
    '''
    Add and/or Remove leading and trailing silence and/or convert audio
    '''
    if silence == "remove":
        print("Revoming leading/middle/trailing silence and convert audio...")
        dir_audio_data_files_trimmed = os.path.join(data_directory, language,
                                                    '_temp_clips_trimmed')
        os.makedirs(dir_audio_data_files_trimmed, exist_ok=True)
        list_arg = [
            (audio_path,
             os.path.join(
                 dir_audio_data_files_trimmed,
                 Method().get_filename(audio_path) + "." + AUDIO_FORMAT), True)
            for audio_path in list_audio_path_preprocessing
        ]
        with Pool(processes=nb_max_parallelized_process) as pool:
            pool.starmap(AudioPreprocessor().trim_silence, tqdm(list_arg))
        shutil.rmtree(dir_audio_data_files_trimmed)

        list_arg = [(audio_path, audio_path)
                    for audio_path in list_audio_path_preprocessing]
        with Pool(processes=nb_max_parallelized_process) as pool:
            pool.starmap(AudioPreprocessor().trim_lead_trail_silence,
                         tqdm(list_arg))

    if silence == "add":
        print("Padding silence...")
        list_arg = [(audio_path, audio_path, silence_threshold, True, True)
                    for audio_path in tqdm(list_audio_path_preprocessing)]
        with Pool(processes=nb_max_parallelized_process) as pool:
            pool.starmap(AudioPreprocessor().add_lead_trail_audio_wav_silence,
                         tqdm(list_arg))
    '''
    Copying audio files into ImageNET tree style
    '''
    print("Copying audio files into ImageNET tree style...")
    for directory in dir_to_create:
        os.makedirs(directory, exist_ok=True)
    list_arg = [
        (path_audio, list_audio_path[index])
        for index, path_audio in enumerate(list_audio_path_preprocessing)
    ]
    with Pool(processes=nb_max_parallelized_process) as pool:
        pool.starmap(shutil.move, tqdm(list_arg))
    shutil.rmtree(dir_audio_data_files_preprocessing)
    '''
    Get ITN symbols from subtitles
    '''
    print("Getting ITN symbols from data...")
    ITN_symbols = DataPreprocessor().get_ITN_data(data_text=list_subtitle,
                                                  data_option=list_audio_path)
    '''
    Update audio path for cluster
    '''
    list_audio_path = [
        audio_path.replace(data_directory, dir_cluster_data)
        for audio_path in list_audio_path
    ]
    '''
    Create taflowtron filelist
    '''
    data_filelist = [
        list_audio_path[index] + "|" + list_subtitle[index] + "|" +
        list_speaker_id[index] for index in range(len(list_subtitle))
    ]
    '''
    Train, test, validation splitting
    '''

    # First split the data into a training set (80%) and a remaining set.
    X_train, X_valid = train_test_split(data_filelist,
                                        train_size=0.8,
                                        random_state=SEED)

    # To make the validation and test sets equal in size (10% of the overall
    # data each), the remaining 20% would be split with test_size=0.5:
    #X_valid, X_test = train_test_split(X_rem, test_size=0.5, random_state=SEED)
    '''
    Write Training, Test, and validation file
    '''
    filename_train = "mcv_audio_text_train_filelist_" + language + "_" + str(
        gender) + ".txt"
    filename_valid = "mcv_audio_text_valid_filelist_" + language + "_" + str(
        gender) + ".txt"
    #filename_test = "mcv_audio_text_test_filelist_" + language + "_" + str(gender) + ".txt"
    filename_ITN_symbols = "mcv_audio_ITN_symbols_" + language + "_" + name_data_config + "_" + ".txt"
    filename_user_information = "mcv_user_voice_informations_" + language + ".tsv"

    new_dir_filelist = os.path.join(directory_taflowtron_filelist, 'mcv',
                                    language, name_data_config)
    dir_ITN = os.path.join(directory_of_results, 'itn', language, "mcv",
                           name_data_config)
    dir_information = os.path.join(directory_of_results, 'data_summary',
                                   language, "mcv", name_data_config)
    os.makedirs(new_dir_filelist, exist_ok=True)
    os.makedirs(dir_ITN, exist_ok=True)
    os.makedirs(dir_information, exist_ok=True)

    path_train_filelist = os.path.join(new_dir_filelist, filename_train)
    path_valid_filelist = os.path.join(new_dir_filelist, filename_valid)
    #path_test_filelist = os.path.join(directory_taflowtron_filelist,filename_test)
    path_filename_user_information = os.path.join(dir_information,
                                                  filename_user_information)
    path_ITN_symbols = os.path.join(dir_ITN, filename_ITN_symbols)

    DataWriter(X_train, path_train_filelist).write_data_file()
    DataWriter(X_valid, path_valid_filelist).write_data_file()
    #DataWriter(X_test, path_test_filelist).write_data_file()
    DataWriter(ITN_symbols, path_ITN_symbols).write_data_file()
    DataWriter(data_info_user,
               path_filename_user_information).write_data_file()
    '''
    Update hparams with filelist and batch size
    '''
    if path_hparam_file is not None:
        dir_hparam = os.path.dirname(path_hparam_file)
        new_path_hparam_file = os.path.join(
            dir_hparam, "config" + "_" + name_train_param_config + ".json")
        path_output_directory = os.path.join(DIR_CLUSTER, dir_tts_model,
                                             name_train_param_config, "outdir")
        warmstart_checkpoint_path = os.path.join(DIR_CLUSTER, dir_tts_model,
                                                 warmstart_model)
        path_cluster_train_filelist = os.path.join(
            DIR_CLUSTER, 'Repositories', 'AI', 'modules', 'tts', tts_model,
            'filelists', 'mcv', language, name_data_config, filename_train)
        path_cluster_valid_filelist = os.path.join(
            DIR_CLUSTER, 'Repositories', 'AI', 'modules', 'tts', tts_model,
            'filelists', 'mcv', language, name_data_config, filename_valid)

        data_haparams = DataReader(path_hparam_file).read_data_file()
        data_haparams = DataWriter(
            data_haparams, new_path_hparam_file).write_edit_data(
                key='        "output_directory": ',
                value='"' + path_output_directory + '",\n')
        if batch_size is not None:
            data_haparams = DataWriter(data_haparams,
                                       new_path_hparam_file).write_edit_data(
                                           key='        "batch_size": ',
                                           value=' ' + str(batch_size) + ',\n')
        data_haparams = DataWriter(
            data_haparams, new_path_hparam_file).write_edit_data(
                key='        "warmstart_checkpoint_path": ',
                value='"' + warmstart_checkpoint_path + '",\n')
        data_haparams = DataWriter(data_haparams,
                                   new_path_hparam_file).write_edit_data(
                                       key='        "language": ',
                                       value='"' + language + '",\n')
        data_haparams = DataWriter(
            data_haparams, new_path_hparam_file).write_edit_data(
                key='        "training_files": ',
                value='"' + path_cluster_train_filelist + '",\n')
        data_haparams = DataWriter(
            data_haparams, new_path_hparam_file).write_edit_data(
                key='        "validation_files": ',
                value='"' + path_cluster_valid_filelist + '",\n')
        # data_haparams = DataWriter(data_haparams, new_path_hparam_file).write_edit_data(key='        "cmudict_path": ', value = '"' + 'data/data' + '_' + language + '/cmudict_dictionary' + '",\n')
        data_haparams = DataWriter(data_haparams,
                                   new_path_hparam_file).write_edit_data(
                                       key='        "n_speakers": ',
                                       value=str(nb_speaker) + ',\n')

    if path_symbols_file is not None:
        '''
        Update symbols
        '''
        if tts_model == "tacotron2":
            data_symbols = DataReader(path_symbols_file).read_data_file()
            pad = DataReader(path_symbols_file).read_data_value(
                key="_pad        = ")[1:-1]
            punctuation = DataReader(path_symbols_file).read_data_value(
                key="_punctuation = ")[1:-1]
            special = DataReader(path_symbols_file).read_data_value(
                key="_special = ")[1:-1]
            not_letter = pad + punctuation + special

            unique_char = set("".join(data_info[ELEMENT_COLUMN]))
            unique_char = "".join(
                [char for char in unique_char if char not in not_letter])
            unique_char = "".join(
                set(unique_char.lower() + unique_char.upper()))

            DataWriter(data_symbols, path_symbols_file).write_edit_data(
                key='_letters = ', value="'" + unique_char + "'\n")
        if tts_model == "flowtron":
            data_symbols = DataReader(path_symbols_file).read_data_file()
            punctuation = DataReader(path_symbols_file).read_data_value(
                key="_punctuation = ")[1:-1]
            # math_symbols rather than `math`, to avoid shadowing the
            # stdlib math module.
            math_symbols = DataReader(path_symbols_file).read_data_value(
                key="_math = ")[1:-1]
            special = DataReader(path_symbols_file).read_data_value(
                key="_special = ")[1:-1]
            accented = DataReader(path_symbols_file).read_data_value(
                key="_accented = ")[1:-1]
            numbers = DataReader(path_symbols_file).read_data_value(
                key="_numbers = ")[1:-1]
            not_letter = punctuation + math_symbols + special + accented + numbers

            unique_char = set("".join(data_info[ELEMENT_COLUMN]))
            unique_char = "".join(
                [char for char in unique_char if char not in not_letter])
            unique_char = "".join(
                set(unique_char.lower() + unique_char.upper()))

            DataWriter(data_symbols, path_symbols_file).write_edit_data(
                key='_letters = ', value="'" + unique_char + "'\n")