Example #1
def generate_audio_response(text, speaker_id):
    global global_config

    model_name = os.path.basename(global_config.load_path)
    isKorean = global_config.is_korean
    
    hashed_text = hashlib.md5(text.encode('utf-8')).hexdigest()

    relative_dir_path = os.path.join(AUDIO_DIR, model_name)
    relative_audio_path = os.path.join(
            relative_dir_path, "{}.{}.wav".format(hashed_text, speaker_id))
    real_path = os.path.join(ROOT_PATH, relative_audio_path)
    makedirs(os.path.dirname(real_path))

    if not os.path.exists(add_postfix(real_path, 0)):
        try:
            audio = synthesizer.synthesize(
                    [text], paths=[real_path], speaker_ids=[speaker_id],
                    attention_trim=True, isKorean=isKorean)[0]
        except Exception as e:
            traceback.print_exc()
            return jsonify(success=False), 400

    return send_file(
            add_postfix(relative_audio_path, 0),
            mimetype="audio/wav", 
            as_attachment=True, 
            attachment_filename=hashed_text + ".wav")
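
A note on the helper used throughout this page: add_postfix itself is not defined here. Judging from call sites such as add_postfix(real_path, 0) resolving to a "<hash>.<speaker_id>.0.wav" file, a minimal sketch could look like the following (an assumption inferred from usage, not the project's actual implementation):

def add_postfix(path, postfix):
    # Insert the postfix between the file stem and its extension,
    # e.g. add_postfix("a/b.wav", 0) -> "a/b.0.wav" (assumed behavior).
    stem, ext = path.rsplit('.', 1)
    return "{}.{}.{}".format(stem, postfix, ext)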

Example #2
def split_on_silence_with_librosa(audio_path,
                                  top_db=40,
                                  frame_length=1024,
                                  hop_length=256,
                                  skip_idx=0,
                                  out_ext="wav",
                                  min_segment_length=3,
                                  max_segment_length=8,
                                  pre_silence_length=0,
                                  post_silence_length=0):

    filename = os.path.basename(audio_path).split('.', 1)[0]
    in_ext = audio_path.rsplit(".", 1)[1]

    audio = load_audio(audio_path)

    edges = librosa.effects.split(audio,
                                  top_db=top_db,
                                  frame_length=frame_length,
                                  hop_length=hop_length)

    new_audio = np.zeros_like(audio)
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        new_audio[start:end] = remove_breath(audio[start:end])

    save_audio(new_audio, add_postfix(audio_path, "no_breath"))
    audio = new_audio
    edges = librosa.effects.split(audio,
                                  top_db=top_db,
                                  frame_length=frame_length,
                                  hop_length=hop_length)

    audio_paths = []
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        segment = audio[start:end]
        duration = get_duration(segment)

        if duration <= min_segment_length or duration >= max_segment_length:
            continue

        output_path = "{}/{}.{:04d}.{}".format(os.path.dirname(audio_path),
                                               filename, idx, out_ext)

        padded_segment = np.concatenate([
            get_silence(pre_silence_length),
            segment,
            get_silence(post_silence_length),
        ])

        save_audio(padded_segment, output_path)
        audio_paths.append(output_path)

    return audio_paths
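
A minimal invocation sketch for the splitter above; the input path is hypothetical, and the segments are written next to the source file:

if __name__ == "__main__":
    # Split one recording into 3-8 second silence-delimited segments.
    paths = split_on_silence_with_librosa(
            "datasets/son/audio/chapter01.wav",  # hypothetical input file
            top_db=40, min_segment_length=3, max_segment_length=8)
    print("wrote {} segments".format(len(paths)))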
Example #3
    def _save_data_core(self, phase='face'):
        data_dir = os.path.join(data_param['data_save_dir'], phase)
        create_dir(data_dir)
        for index in range(len(self.faces)):
            name = self.names[index]
            img_path = os.path.join(data_dir,
                                    add_postfix(name, "_{}".format(phase)))
            cv2.imwrite(img_path, self.faces[index])
            np.savetxt(os.path.splitext(img_path)[0] + ".pts",
                       self.aug_landmarks[index],
                       fmt="%.4f")
            np.savetxt(os.path.splitext(img_path)[0] + ".opts",
                       self.occlusions[index],
                       fmt="%d")
Example #4
    def _split_core(self, x, y, mode, phase):
        data_dir = os.path.join(data_param['data_save_dir'], mode)
        for index in range(len(x)):
            img = x[index][0]
            name = x[index][1]
            landmark = y[index][0] * self.img_size
            occlusion = y[index][1]

            # save data
            img_path = os.path.join(data_dir,
                                    add_postfix(name, "_{}".format(phase)))
            cv2.imwrite(img_path, img)
            np.savetxt(os.path.splitext(img_path)[0] + ".pts",
                       landmark,
                       fmt="%.4f")
            np.savetxt(os.path.splitext(img_path)[0] + ".opts",
                       occlusion,
                       fmt="%d")
Example #5
    def _balance(self, balanced_num=None):
        """Balance dataset
        Increase occlusion objs by (balanced_num + 1) times

        :param balanced_num: required balanced_num to increase nums of occlusion objs
        """
        count = 0
        for index in range(self.data_size):
            if np.sum(self.occlusions[index]) > 0:
                count += 1
        ratio = float(count) / self.data_size
        balanced_num = int(1 / ratio) if balanced_num is None else balanced_num
        occlusions_add = []
        heatmaps_add = []
        faces_add = []
        names_add = []
        landmarks_add = []
        for index in range(len(self.occlusions)):
            if np.sum(self.occlusions[index]) > 0:
                for num in range(balanced_num):
                    heatmap = gaussian_noise(self.heat_maps[index],
                                             color=self.color)
                    heatmaps_add.append(heatmap)
                    face = gaussian_noise(self.faces[index], color=self.color)
                    faces_add.append(face)
                    occlusions_add.append(self.occlusions[index])
                    landmarks_add.append(self.aug_landmarks[index])
                    names_add.append(
                        add_postfix(self.names[index],
                                    "_gaussian_{}".format(num)))
            if self.print_debug and (index + 1) % 500 == 0:
                logger("data aug phase 2 processed {} images".format(index + 1))
        self.faces = extend(self.faces, faces_add)
        self.occlusions.extend(occlusions_add)
        self.heat_maps.extend(heatmaps_add)
        self.aug_landmarks.extend(landmarks_add)
        self.names.extend(names_add)
        self.data_size = len(self.occlusions)
        logger("length of imgs and occlusions is {}".format(self.data_size))
Example #6
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):

    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
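
A toy illustration (not from the source) of the attention-trim bookkeeping above: alignment has shape (text length, decoder length), so argmax over axis 0 yields, for each decoder frame, the text position being attended to:

import numpy as np

alignment = np.array([[0.9, 0.1, 0.0, 0.0],   # text position 0
                      [0.1, 0.8, 0.2, 0.1],   # text position 1
                      [0.0, 0.1, 0.8, 0.9]])  # text position 2
attention_argmax = alignment.argmax(0)  # -> [0, 1, 2, 2]
end_idx = max(attention_argmax)         # last attended text position (2)
# Decoder frames past the point where attention leaves end_idx get trimmed.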
Example #7
def plot_graph_and_save_audio(args,
							  base_path=None,
							  start_of_sentence=None, end_of_sentence=None,
							  pre_word_num=0, post_word_num=0,
							  pre_surplus_idx=0, post_surplus_idx=1,
							  save_alignment=False,
							  librosa_trim=False, attention_trim=False,
							  time_str=None, isKorean=True, config=None):
	idx, (wav, alignment, path, text, sequence, mel) = args

	if base_path:
		plot_path = "{}/{}_{}.png".format(base_path, config.file.split('.')[0], idx)
	elif path:
		plot_path = path.rsplit('.', 1)[0] + ".png"
	else:
		plot_path = None

	if plot_path:
		plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

	if attention_trim and end_of_sentence:
		# If the attention has reached the end of the text, discard everything after that point.
		end_idx_counter = 0
		attention_argmax = alignment.argmax(0)
		# alignment: (text length (encoder), decoder length); argmax over axis 0
		# gives the attended text position for each decoder frame.
		end_idx = min(len(sequence) - 1, max(attention_argmax))
		max_counter = min((attention_argmax == end_idx).sum(), 5)

		for jdx, attend_idx in enumerate(attention_argmax):
			if len(attention_argmax) > jdx + 1:
				if attend_idx == end_idx:
					end_idx_counter += 1

				if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
					break

				if end_idx_counter >= max_counter:
					break
			else:
				break

		spec_end_idx = hparams.reduction_factor * jdx + 3
		wav = wav[:spec_end_idx]
		mel = mel[:spec_end_idx]

	audio_out = inv_linear_spectrogram(wav.T, hparams)

	if librosa_trim and end_of_sentence:
		yt, index = librosa.effects.trim(audio_out, frame_length=5120, hop_length=256, top_db=50)
		audio_out = audio_out[:index[-1]]
		mel = mel[:index[-1] // hparams.hop_size]

	if save_alignment:
		alignment_path = "{}/{}.npy".format(base_path, idx)
		np.save(alignment_path, alignment, allow_pickle=False)

	if path or base_path:
		if path:
			current_path = add_postfix(path, idx)
		elif base_path:
			current_path = plot_path.replace(".png", ".wav")

		save_wav(audio_out, current_path, hparams.sample_rate)

		# hccho
		mel_path = current_path.replace(".wav", ".npy")
		np.save(mel_path, mel)
		return current_path
	else:
		io_out = io.BytesIO()
		save_wav(audio_out, io_out, hparams.sample_rate)
		result = io_out.getvalue()
		return result
Example #8
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):

    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # If the attention has reached the end of the text, discard everything after that point.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        # alignment: (text length (encoder), decoder length); argmax over axis 0
        # gives the attended text position for each decoder frame.
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # max_counter = min((attention_argmax == end_idx).sum(), 5) + 1
        # 20200612: the line above capped max_counter at min(number of frames
        # attending to end_idx, 5). Korean tends to hold the final sound, so rather
        # than cutting at 5 we keep every frame that actually attends to the end;
        # hence the replacement below. (Why the original author capped it at 5 is unclear.)
        max_counter = (attention_argmax == end_idx).sum()

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_wav(audio_out, current_path, hparams.sample_rate)

        #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)

        #return True
        return audio_out
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        return audio_out
Example #9
def generate_audio_response(textList, speaker_id, alarm_id):
    #global global_config
    #model_name = os.path.basename(global_config.load_path)
    #iskorean=global_config.is_korean
    audio_clear()

    global member_id, method_id
    if member_id != speaker_id:
        
        if speaker_id == 0:
            if not (member_id==0):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/son+yuinna', 2)
        elif speaker_id == 3:
            if not (member_id==3):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/new_inna+kss+leejh+nandong2',4)
        else:
            if not (member_id==1 or member_id==2 or member_id==4):
                if member_id != -1:
                    synthesizer.close()
                synthesizer.load('logs/backup_log/new_inna+kss+leejh', 3)
       
        member_id = speaker_id      

    if speaker_id==0:
        model_name='손석희'
        #speaker_id=0        
    elif speaker_id==1:
        model_name='유인나'
        speaker_id=0
    elif speaker_id==2:
        model_name='코퍼스'  # Korean corpus
        speaker_id=1
    elif speaker_id==3:
        model_name='김난희'
        #speaker_id=3
    else:
        model_name='이주형'
        speaker_id=2

    ###########################################################################################
    # The block below loops over the input texts.

    textcnt = 0  # index of the current text
    audio_list = []  # kept for debugging
    print(textList)
    for text in textList:
        # hashed_text = hashlib.md5(text.encode('utf-8')).hexdigest()  # hash of the text
        hashed_text = "{}".format(str(textcnt))

        # repeated once per text:
        # build the output path for this clip
        relative_dir_path = os.path.join(AUDIO_DIR, model_name)
        relative_audio_path = os.path.join(
                relative_dir_path, "{}.{}.wav".format(hashed_text, speaker_id))
        real_path = os.path.join(ROOT_PATH, relative_audio_path)
        
        makedirs(os.path.dirname(real_path))
        
        if not os.path.exists(add_postfix(real_path, 0)):
            try:
                # audio is the output file name
                audio = synthesizer.synthesize(
                        [text], paths=[real_path], speaker_ids=[speaker_id],
                        attention_trim=True)[0]
                audio_list.append(audio)
            except Exception:
                return jsonify(success=False), 400

        textcnt += 1
    
    ###########################################################################################

    # Combine the audio clips; the combined file is named 'output.wav'.
    CUR_PATH = os.getcwd()
    #print(CUR_PATH)  # for checking the audio path
    FILE_PATH = os.path.join(AUDIO_PATH, model_name)
    #print(FILE_PATH)  # for checking the audio path
    print("running method {}".format(method_id))
    alarm_type = 0
    alarm_id -= 1

    if (method_id == 1) or (method_id == 2):  # basic
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))
    elif method_id == 3:  # morning_call
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        if alarm_id in (0, 1, 2, 3):
            alarm_type = 0
        else:
            alarm_id = (alarm_id - 4)
            alarm_type = 1
        create_alarm(alarm_id, model_name, alarm_type) # bgm_select, model_name, type
    elif method_id == 4:  # briefing
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        create_briefing(alarm_id, model_name) # bgm_select, model_name, #0 1 2 3
    elif method_id == 5:  # birthday
        combine_audio(os.path.join(CUR_PATH, FILE_PATH))  # web\audio\model_name\output.wav
        create_birthday(0, model_name) # bgm_select, model_name, #0 1 2 3

    #print(os.path.join(CUR_PATH, FILE_PATH))
    #print(TEST_PATH)

    ###########################################################################################

    return send_file(
        os.path.join('audio', model_name, 'output.wav'),
        mimetype="audio/wav",
        as_attachment=True,
        attachment_filename=hashed_text + ".wav")
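
The helper above reads like the body of a Flask view; a minimal sketch of wiring it up (the route name, app object, and JSON field names are assumptions, not from the source):

from flask import Flask, request

app = Flask(__name__)

@app.route("/generate", methods=["POST"])
def generate():
    # Delegate to the helper defined in Example #9.
    data = request.get_json()
    return generate_audio_response(
            data["textList"], int(data["speaker_id"]), int(data["alarm_id"]))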