Example No. 1
    def convert_image(self, image_path):
        """OCR the given image and write the extracted text to a .txt file."""
        img = cv2.imread(image_path)
        text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
        image_path = os.path.normpath(image_path)
        # the parent directory name becomes the output sub-directory,
        # the image file stem becomes the output file name
        file_name = image_path.split(os.sep)[-2]
        tag_name = image_path.split(os.sep)[-1].split(".")[0]
        file_path = os.path.join(self._text_out_dir, file_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        text_file_path = os.path.join(file_path, tag_name + ".txt")
        print_error(text_file_path)
        with open(text_file_path, "w") as fd:
            fd.write("%s" % text)
        return text_file_path
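A minimal standalone sketch of the same OCR-to-text step, assuming `cv2`, `pytesseract` and a local Tesseract install are available; the directory layout (`<label>/<image>.png`) and the example path are hypothetical, chosen only to mirror what the method above expects.

import os

import cv2
import pytesseract


def ocr_image(image_path, out_dir="text_out"):
    """Standalone sketch of convert_image: OCR one image into a .txt file."""
    img = cv2.imread(image_path)
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')

    image_path = os.path.normpath(image_path)
    label_dir = image_path.split(os.sep)[-2]                      # parent folder name
    stem = os.path.splitext(os.path.basename(image_path))[0]      # file stem

    target_dir = os.path.join(out_dir, label_dir)
    os.makedirs(target_dir, exist_ok=True)

    text_file_path = os.path.join(target_dir, stem + ".txt")
    with open(text_file_path, "w") as fd:
        fd.write(text)
    return text_file_path


# Hypothetical call; expects e.g. data/invoices/scan_001.png on disk.
# print(ocr_image("data/invoices/scan_001.png"))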
Example No. 2
    def __init__(self, hparams=None, data_iterator=None):
        """
        https://arxiv.org/abs/1508.04306
        :param hparams:
        :param data_iterator:
        """
        # ITextFeature.__init__(self)
        ModelBase.__init__(self, hparams=hparams)
        ShabdaWavPairFeature.__init__(self)
        self._hparams = HParams(hparams, self.default_hparams())

        print_error(self._hparams)
        self.lstm_hidden_size = self._hparams.lstm_hidden_size
        self.batch_size = self._hparams.batch_size
        self.p_keep_ff = self._hparams.p_keep_ff
        self.p_keep_rc = self._hparams.p_keep_rc
        self.neff = self._hparams.neff
        self.embd_dim_k = self._hparams.embd_dim_k
        self.frames_per_sample = self._hparams.frames_per_sample
        self.weights = None
        self.biases = None
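The constructor only reads a handful of keys from the merged hyperparameters. A hypothetical `default_hparams`-style dictionary is sketched below; `neff`, `frames_per_sample` and `embd_dim_k` follow the shapes quoted in the later examples (NEFF = 129, 100 frames per sample, EMD_K = 30), while the remaining values are only placeholder assumptions.

# Hypothetical hyperparameter set for the deep-clustering model above.
# Keys mirror the attributes read in __init__; values are a sketch only.
def default_hparams():
    return {
        "lstm_hidden_size": 300,   # units per BLSTM layer (assumption)
        "batch_size": 32,          # training batch size (assumption)
        "p_keep_ff": 0.9,          # feed-forward dropout keep probability (assumption)
        "p_keep_rc": 0.8,          # recurrent dropout keep probability (assumption)
        "neff": 129,               # effective FFT bins (frame_size // 2 + 1)
        "embd_dim_k": 30,          # embedding dimension K
        "frames_per_sample": 100,  # STFT frames per training sample
    }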
Example No. 3
def _get_speech_features(wav_file_1, wav_file_2, sampling_rate, frame_size,
                         neff, amp_fac, min_amp, threshold, global_mean,
                         global_std, frames_per_sample):
    """


    :param args:
    :return:
    """
    # (wav_file_1, wav_file_2, sampling_rate, frame_size, neff, amp_fac, min_amp,
    #  threshold, global_mean, global_std, frames_per_sample) = args

    try:
        # TODO: experimental though - is multi-processing required here? to reduce IO
        # print_info("{} {}".format(wav_file_1, wav_file_2))
        def _get_data():
            speech_1 = _get_speech_data(wav_file_1, sampling_rate)
            speech_2 = _get_speech_data(wav_file_2, sampling_rate)

            # print(wav_file_1, wav_file_2)

            # find the minimum length of two speeches
            length = min(len(speech_1), len(speech_2))
            #trim both the speeches to the minimum length
            speech_1 = speech_1[:length]
            speech_2 = speech_2[:length]
            # mix the signals
            speech_mix = speech_1 + speech_2

            #get the spectral features in dB
            speech_1_features = _get_log_spectrum_features(
                speech_1, frame_size, neff, amp_fac, min_amp)
            speech_2_features = _get_log_spectrum_features(
                speech_2, frame_size, neff, amp_fac, min_amp)
            speech_mix_features = _get_log_spectrum_features(
                speech_mix, frame_size, neff, amp_fac, min_amp)

            max_mag = np.max(speech_mix_features)
            # apply threshold to the feature signal, to find the silent portion of the signal and
            # construct a boolean array as a feature
            # https://en.wikipedia.org/wiki/Voice_activity_detection
            speech_voice_activity_detection = (speech_mix_features >
                                               (max_mag - threshold))
            # normalize the signal values with given global mean and std
            speech_mix_features_final = (speech_mix_features -
                                         global_mean) / global_std

            number_frames = speech_1_features.shape[0]

            new_data = _get_speech_samples(speech_1_features,
                                           speech_2_features,
                                           frames_per_sample, number_frames,
                                           speech_mix_features_final,
                                           speech_voice_activity_detection)
            # print_error("deleting speech_1_features, speech_2_features, speech_voice_activity_detection, speech_mix_features")
            speech_1_features = speech_2_features = None
            speech_voice_activity_detection = None
            speech_mix_features = speech_mix_features_final = None
            return new_data

        new_data = _get_data()
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as err:
        print_error(traceback.format_exc())
        new_data = []

    return new_data
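The trim-and-mix step inside `_get_data` is plain NumPy; a minimal sketch of what it does, with synthetic signals standing in for the two loaded wav files:

import numpy as np

# Synthetic stand-ins for the two loaded speech signals.
speech_1 = np.random.randn(16000)   # pretend 2 s at 8 kHz
speech_2 = np.random.randn(12000)   # pretend 1.5 s at 8 kHz

# Trim both signals to the shorter length, then mix by simple addition.
length = min(len(speech_1), len(speech_2))
speech_mix = speech_1[:length] + speech_2[:length]

print(speech_mix.shape)  # (12000,) - both sources trimmed to the shorter one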
Example No. 4
    def generate_features(self, wav_file_1, wav_file_2):
        """Mix two wav files at random gains and return (mixture features, VAD mask, ideal binary mask Y)."""

        try:
            start = time.time()
            speech_1, _ = librosa.core.load(wav_file_1,
                                            sr=self._hparams.sampling_rate)
            # random amplitude factor between -3 dB and +3 dB
            fac = np.random.rand(1)[0] * 6 - 3
            speech_1 = 10.**(fac / 20) * speech_1

            speech_2, _ = librosa.core.load(wav_file_2,
                                            sr=self._hparams.sampling_rate)
            fac = np.random.rand(1)[0] * 6 - 3
            speech_2 = 10.**(fac / 20) * speech_2

            # mix
            length = min(len(speech_1), len(speech_2))
            speech_1 = speech_1[:length]
            speech_2 = speech_2[:length]
            speech_mix = speech_1 + speech_2

            # compute log spectrum for 1st speaker
            speech_1_features = np.abs(
                stft(speech_1,
                     self._hparams.frame_size)[:, :self._hparams.neff])
            speech_1_features = np.maximum(
                speech_1_features,
                np.max(speech_1_features) / self._hparams.min_amp)
            speech_1_features = 20. * np.log10(
                speech_1_features * self._hparams.amp_fac)

            # same for the 2nd speaker
            speech_2_features = np.abs(
                stft(speech_2,
                     self._hparams.frame_size)[:, :self._hparams.neff])
            speech_2_features = np.maximum(
                speech_2_features,
                np.max(speech_2_features) / self._hparams.min_amp)
            speech_2_features = 20. * np.log10(
                speech_2_features * self._hparams.amp_fac)

            # same for the mixture
            speech_mix_spec0 = stft(
                speech_mix, self._hparams.frame_size)[:, :self._hparams.neff]
            speech_mix_features = np.abs(speech_mix_spec0)

            # speech_phase = speech_mix_spec0 / speech_mix_spec
            speech_mix_features = np.maximum(
                speech_mix_features,
                np.max(speech_mix_features) / self._hparams.min_amp)
            speech_mix_features = 20. * np.log10(
                speech_mix_features * self._hparams.amp_fac)
            max_mag = np.max(speech_mix_features)

            # if np.isnan(max_mag):
            # import ipdb; ipdb.set_trace()
            speech_VAD = (speech_mix_features >
                          (max_mag - self._hparams.threshold)).astype(int)

            speech_mix_features = (
                speech_mix_features -
                self._hparams.global_mean) / self._hparams.global_std

            # The ideal binary mask gives ownership of a time-frequency bin to the
            # source whose magnitude is maximum among all sources in that bin.
            # Mask values are 1 for the active source and 0 otherwise (binary),
            # making Y Y^T the ideal affinity matrix for the mixture.
            Y = np.array([
                speech_1_features > speech_2_features,
                speech_1_features < speech_2_features
            ]).astype('bool')
            Y = np.transpose(Y, [1, 2, 0]).astype('bool')

            # speech_mix_features = speech_mix_features[0:self._hparams.dummy_slicing_dim, :]
            # speech_VAD = speech_VAD[0:self._hparams.dummy_slicing_dim, :]
            # Y = Y[0:self._hparams.dummy_slicing_dim, :, :]

            # print_info("{} vs {}".format(wav_file_1, wav_file_2))
            end = time.time()

            print_info("Thread name: {} : took {}".format(
                threading.currentThread().getName(), end - start))

            if (speech_mix_features.shape[0] != 1247 or
                    speech_VAD.shape[0] != 1247 or Y.shape[0] != 1247):
                raise Exception("Found files with improper duration/data")

            return speech_mix_features.astype('float32'), speech_VAD.astype(
                'bool'), Y.astype('bool')
        except Exception as e:
            print_warn(e)
            print_error("{} vs {}".format(wav_file_1, wav_file_2))
            return np.random.random((self._hparams.dummy_slicing_dim,129)).astype('float32'), \
                   np.empty((self._hparams.dummy_slicing_dim,129), dtype="bool"), \
                   np.empty((self._hparams.dummy_slicing_dim,129, 2), dtype="bool")
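The per-source feature computation repeated three times above (magnitude STFT, floor at max/min_amp, conversion to dB) can be factored into one helper. A sketch follows, using `librosa.stft` as a stand-in for the module's own `stft` helper (an assumption about that helper's behaviour, not the original implementation):

import numpy as np
import librosa


def log_spectrum_features(signal, frame_size, neff, amp_fac, min_amp):
    """Sketch of the repeated block above: magnitude STFT -> floor -> dB scale.

    librosa.stft returns bins x frames, so the result is transposed to
    frames x bins and only the first ``neff`` bins are kept.
    """
    spec = np.abs(librosa.stft(signal, n_fft=frame_size)).T[:, :neff]
    spec = np.maximum(spec, np.max(spec) / min_amp)   # limit dynamic range
    return 20. * np.log10(spec * amp_fac)             # amplitude in dB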
Example No. 5
    def discriminator(self, x, out_channel_dim, is_training=True, reuse=False):
        # The discriminator must be an auto-encoder style architecture
        # Architecture : (64)4c2s-FC32_BR-FC64*14*14_BR-(1)4dc2s_S
        with tf.variable_scope("namespace_discriminator", reuse=reuse):
            # net = tf.nn.relu(conv2d(x, 64, 4, 4, 2, 2, name='d_conv1'))
            net = tf.layers.conv2d(
                x,
                64,
                4,
                strides=2,
                padding='same',
                kernel_initializer=tf.random_normal_initializer(stddev=0.02),
                name='d_conv1')
            net = tf.nn.relu(net)

            tf.logging.info("======> net: {}".format(net))
            print_error("net1: {} ".format(net))

            size = (self.image_size // 2)

            net = tf.reshape(
                net, [self._data_iterator.batch_size, size * size * 64])

            # code = tf.nn.relu(bn(linear(net, 32, scope='d_fc6'), is_training=is_training, scope='d_bn6'))
            code = tf.contrib.layers.fully_connected(inputs=net,
                                                     num_outputs=32,
                                                     scope="d_fc6")
            code = tf.contrib.layers.batch_norm(code,
                                                decay=0.9,
                                                updates_collections=None,
                                                epsilon=1e-5,
                                                scale=True,
                                                is_training=is_training,
                                                scope='d_bn6')
            code = tf.nn.relu(code)

            print_error("code: {} ".format(code))
            # net = tf.nn.relu(bn(linear(code, 64 * 14 * 14, scope='d_fc3'), is_training=is_training, scope='d_bn3'))
            size = (self.image_size // 2)
            net = tf.contrib.layers.fully_connected(inputs=code,
                                                    num_outputs=64 * size *
                                                    size,
                                                    scope="d_fc3")

            net = tf.contrib.layers.batch_norm(net,
                                               decay=0.9,
                                               updates_collections=None,
                                               epsilon=1e-5,
                                               scale=True,
                                               is_training=is_training,
                                               scope='d_bn3')
            print_error("net: {} ".format(net))
            print_error(net)

            size = (self.image_size // 2)
            net = tf.reshape(net,
                             [self._data_iterator.batch_size, size, size, 64])
            print_error(net)

            # out = tf.nn.sigmoid(deconv2d(net, [self.gan_config.batch_size, 28, 28, 1], 4, 4, 2, 2, name='d_dc5'))
            net = tf.layers.conv2d_transpose(net,
                                             out_channel_dim,
                                             4,
                                             strides=2,
                                             padding='same',
                                             name='d_dc5')
            out = tf.nn.sigmoid(net)

            print_info("==================================")
            print_info(out)
            print_info(x)
            # recon loss
            recon_error = tf.sqrt(
                2 * tf.nn.l2_loss(out - x)) / self._data_iterator.batch_size
            print_info("==================================")
            print_error(recon_error)

            return out, recon_error, code
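The reconstruction term at the end is just the Euclidean norm of the reconstruction error divided by the batch size, since `tf.nn.l2_loss(t)` is `sum(t**2) / 2`. A NumPy sketch of the same quantity, with hypothetical tensors in place of `out` and `x`:

import numpy as np

# NumPy sketch of the reconstruction error used above:
# sqrt(2 * l2_loss(out - x)) equals the Euclidean (Frobenius) norm of (out - x).
def recon_error(out, x, batch_size):
    return np.sqrt(np.sum((out - x) ** 2)) / batch_size


out = np.random.rand(4, 28, 28, 1)   # hypothetical reconstructions
x = np.random.rand(4, 28, 28, 1)     # hypothetical inputs
print(recon_error(out, x, batch_size=4))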
Example No. 6
    def visulaize(self, executor, file_path):
        """

        :param executor:
        :param test_file_path:
        :return:
        """

        estimator = executor.estimator

        in_data_features, voice_activity_detection_data_features, phase_features = self._get_predict_samples(
            file_path=file_path)

        in_data_features = np.asarray(in_data_features)
        voice_activity_detection_data_features = np.asarray(
            voice_activity_detection_data_features)

        N_frames = in_data_features.shape[0]
        hop_size = self._hparams.frame_size // 4

        def get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(({
                self.FEATURE_1_NAME:
                in_data_features,
                self.FEATURE_2_NAME:
                voice_activity_detection_data_features
            }, np.ones_like(in_data_features)))
            dataset = dataset.batch(batch_size=1)
            print_info(dataset.output_shapes)
            return dataset

        predict_fn = estimator.predict(input_fn=lambda: get_dataset())

        print_info("Shape of in data: {}".format(in_data_features.shape))
        print_info("Number of frames for given file: {}".format(N_frames))

        embeddings = []
        i = 0

        for predicted_value in predict_fn:
            # print("i = {}".format(i))
            """
            TODO:
            strange behaviour!
            
            1 wav file = N samples
            Eg: N = 600
            FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30
            
            For each sample the embeddings is of shape [batch_size * frames_per_sample, NEFF, embd_dim].
            For prediction batch size is made 1.
            Hence the embeddings collapse to [frames_per_sample, NEFF, embd_dim].
            1 sample's predictions will have `frames_per_sample` outputs.
            Eg: If the input audio file has 75 samples, the prediction will have [7500, NEFF, embd_dim]
            """
            embeddings.append(predicted_value)
            i += 1

        print_info("Number of embeddings predicted for given file: {}".format(
            len(embeddings)))
        print_error(np.asarray(embeddings).shape)

        N_assign = 0
        step = 0

        for frame_i in tqdm(range(N_frames)):

            # expand the dimension to be in line with the TF batch size
            in_data_np = np.expand_dims(in_data_features[frame_i], axis=0)
            in_phase_np = np.expand_dims(phase_features[frame_i], axis=0)
            voice_activity_detection_data_np = np.expand_dims(
                voice_activity_detection_data_features[frame_i], axis=0)
            embedding_np = np.asarray(
                embeddings[frame_i:frame_i + self._hparams.frames_per_sample])

            # ----------------------------------------------

            embedding_ac = []
            for i, j in itertools.product(
                    range(self._hparams.frames_per_sample),
                    range(self._hparams.neff)):
                if voice_activity_detection_data_np[0, i, j] == 1:
                    embedding_ac.append(embedding_np[i, j, :])

            kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)
            # visualization using 3 PCA
            pca_Data = PCA(n_components=3).fit_transform(embedding_ac)
            fig = plt.figure(1, figsize=(8, 6))
            ax = Axes3D(fig, elev=-150, azim=110)
            # ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2],
            #            c=kmean.labels_, cmap=plt.cm.Paired)
            ax.scatter(pca_Data[:, 0],
                       pca_Data[:, 1],
                       pca_Data[:, 2],
                       cmap=plt.cm.Paired)
            ax.set_title('Embedding visualization using the first 3 PCs')
            ax.set_xlabel('1st pc')
            ax.set_ylabel('2nd pc')
            ax.set_zlabel('3rd pc')
            if not os.path.exists("vis"):
                os.makedirs("vis")
            plt.savefig('vis/' + str(step) + 'pca.jpg')

            step += 1
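The clustering-and-plotting idiom used above, sketched in isolation with random embeddings standing in for the network output (scikit-learn and matplotlib assumed available):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3D projection)

# Random stand-in for the active (non-silent) time-frequency embeddings.
embedding_ac = np.random.rand(500, 30)          # 500 active bins, K = 30

kmeans = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)
pca_data = PCA(n_components=3).fit_transform(embedding_ac)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2],
           c=kmeans.labels_, cmap=plt.cm.Paired)
ax.set_title('Embedding visualization using the first 3 PCs')
plt.savefig('pca_sketch.jpg')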
Example No. 7
    def discriminator(self, images, input_z, reuse=False):
        """
        Create the _discriminator network
        :param image: Tensor of input image(s)
        :param reuse: Boolean if the weights should be reused
        :return: Tuple of (tensor output of the _discriminator, tensor logits of the _discriminator)
        """

        _, width, height, channel = images.get_shape().as_list()

        print_error("{}".format([width, height, channel]))

        with tf.variable_scope('_discriminator', reuse=reuse):

            # y = tf.reshape(input_z, [-1, 1, 1, 740], name="y_reshape") #2*2*185=>740
            # input_z = tf.layers.batch_normalization(input_z)
            # input_z = tf.layers.dense(input_z, width*height*channel)
            # y = tf.reshape(input_z, [-1, width,height,channel], name="y_reshape")
            # print_error(y)

            # x1 = conv_cond_concat(y,images)
            '''
            # c_code = tf.expand_dims(tf.expand_dims(input_z, 1), 1)
            # c_code = tf.tile(c_code, [1, 1, 1, 740])
            # x1 = tf.concat([images, c_code], 3)
            # print_error(x1)
            '''
            # x1 = tf.layers.batch_normalization(images)

            # Input layer consider ?x32x32x3
            x1 = tf.layers.conv2d(
                images,
                64,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.02))
            x1 = tf.maximum(self.alpha * x1, x1)

            x1 = tf.layers.batch_normalization(x1, training=True)
            # relu1 = tf.layers.dropout(relu1, rate=0.5)
            # 16x16x64
            #         print(x1)
            x2 = tf.layers.conv2d(
                x1,
                128,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.02))
            x2 = tf.maximum(self.alpha * x2, x2)

            x2 = tf.layers.batch_normalization(x2, training=True)
            # relu2 = tf.layers.dropout(relu2, rate=0.5)
            # 8x8x128
            #         print(x2)
            x3 = tf.layers.conv2d(
                x2,
                256,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.02))
            x3 = tf.maximum(self.alpha * x3, x3)

            x3 = tf.layers.batch_normalization(x3, training=True)
            # relu3 = tf.layers.dropout(relu3, rate=0.5)
            # 4x4x256
            #         print(x3)
            # Flatten it
            flat = tf.reshape(x3, (-1, 4 * 4 * 256))

            flat = tf.concat([flat, input_z], -1)

            # conditioned_fully_connected_layer = tf.concat([flat], axis=-1)
            #
            # flat = tf.layers.dense(flat, 512)
            # flat = tf.layers.dense(flat, 1024)
            # flat = tf.layers.dense(flat, 512)
            # flat = tf.layers.dense(flat, 256)
            # flat = tf.layers.dense(flat, 128)

            logits = tf.layers.dense(flat, 512)
            logits = tf.maximum(self.alpha * logits, logits)
            logits = tf.layers.dense(logits, 1)

            #         print(logits)
            out = tf.sigmoid(logits)
            #         print('_discriminator out: ', out)

            print_info("======>out: {}".format(out))

            return out, logits
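The `tf.maximum(self.alpha * x, x)` pattern applied after every convolution above is simply a leaky ReLU; a small NumPy sketch of the equivalence (the slope 0.2 is an assumption, since `self.alpha` is not shown here):

import numpy as np

def leaky_relu(x, alpha=0.2):
    """Equivalent of the tf.maximum(alpha * x, x) pattern used above."""
    return np.maximum(alpha * x, x)

x = np.array([-2.0, -0.5, 0.0, 1.5])
print(leaky_relu(x))   # [-0.4 -0.1  0.   1.5]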
Example No. 8
    def predict_on_instance(self, executor, file_path):
        """
        Given a mixed audio file, it generates three audio files : mix recontructed, source1, source 2
        #TODO debug!!!
        :param executor:
        :param test_file_path:
        :return:
        """

        estimator = executor.estimator

        in_data_features, voice_activity_detection_data_features, phase_features = self._get_predict_samples(
            file_path=file_path)

        print_info("in_data_features original shape: {}".format(
            in_data_features.shape))

        voice_activity_detection_data_features = np.asarray(
            voice_activity_detection_data_features)

        num_samples_N = in_data_features.shape[0]
        hop_size = self._hparams.frame_size // 4  # 256 / 4 = 64

        # (38 * 100 - 1) * 64 + 256 = 243392 samples, for a 10 second wav file
        # appended three times, i.e. 10 * 8000 (sampling rate) * 3 = 240000
        out_audio1 = np.zeros([
            (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size +
            self._hparams.frame_size
        ])
        out_audio2 = np.zeros([
            (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size +
            self._hparams.frame_size
        ])
        mix = np.zeros([
            (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size +
            self._hparams.frame_size
        ])

        def get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(({
                self.FEATURE_1_NAME:
                in_data_features,
                self.FEATURE_2_NAME:
                voice_activity_detection_data_features
            }, np.ones_like(in_data_features)))
            dataset = dataset.batch(batch_size=1)
            print_info(dataset.output_shapes)
            return dataset

        predict_fn = estimator.predict(input_fn=lambda: get_dataset())

        print_info("Shape of in data: {}".format(in_data_features.shape))
        print_info("Number of sample for given file: {}".format(num_samples_N))

        embeddings = []
        i = 0

        for predicted_value in predict_fn:
            # print("i = {}".format(i))
            """
            TODO:
            strange behaviour!
            
            1 wav file = N samples
            Eg: N = 600
            FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30
            
            For each sample the embeddings is of shape [batch_size * frames_per_sample, NEFF, embd_dim].
            For prediction batch size is made 1.
            Hence the embeddings collapse to [frames_per_sample, NEFF, embd_dim].
            1 sample's predictions will have `frames_per_sample` outputs.
            Eg: If the input audio file has 75 samples, the prediction will have [7500, NEFF, embd_dim]
            """
            embeddings.append(predicted_value)
            i += 1

        # Number of embeddings predicted for given file: 3800 with shape (3800, 129, 40) [number of samples * frames_per_sample, NEFF, EMBD_DIM]
        print_info(
            "Number of embeddings predicted for given file: {} with shape {}".
            format(len(embeddings),
                   np.asarray(embeddings).shape))

        N_assign = 0
        # # for every chunk of frames of data
        for sample_i in tqdm(range(num_samples_N)):  # num_samples = 38

            # expand the dimension to be in line with the TF batch size
            in_data_np = np.expand_dims(in_data_features[sample_i], axis=0)
            in_phase_np = np.expand_dims(phase_features[sample_i], axis=0)
            voice_activity_detection_data_np = np.expand_dims(
                voice_activity_detection_data_features[sample_i], axis=0)
            print_info("in_data_np : ")
            print_error(in_data_np.shape)
            """
            0*100 to (0+1)*100 = 0   to 100
            1*100 to (1+1)*100 = 100 to 200
            2*100 to (2+1)*100 = 200 to 300
            
            """
            embedding_np = np.asarray(
                embeddings[sample_i *
                           self._hparams.frames_per_sample:(sample_i + 1) *
                           self._hparams.frames_per_sample])

            # ----------------------------------------------

            # embedding_ac = []
            # for i, j in itertools.product(range(self._hparams.frames_per_sample), range(self._hparams.neff)):
            #     if voice_activity_detection_data_np[0, i, j] == 1:
            #         embedding_ac.append(embedding_np[i, j, :])

            embedding_ac = [
                embedding_np[i, j, :] for i, j in itertools.product(
                    range(self._hparams.frames_per_sample),
                    range(self._hparams.neff))
                if voice_activity_detection_data_np[0, i, j] == 1
            ]
            print_error(np.array(embedding_ac).shape)
            if embedding_ac == []:
                break
            kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)

            # ----------------------------------------------

            mask = np.zeros(
                [self._hparams.frames_per_sample, self._hparams.neff, 2])
            ind = 0

            # print_info("N_assign : {}".format(N_assign))

            center = kmean.cluster_centers_
            center = center * 0.7 + 0.3 * kmean.cluster_centers_
            # two speakers have appeared
            # print_info("Found 2 speakers ...")
            center_new = kmean.cluster_centers_
            cor = np.matmul(center_new[0, :], np.transpose(center))
            # rearrange their sequence if not consistant with previous
            # frames
            # print_info("Correlation : {}".format(cor))

            if (cor[1] > cor[0]):
                kmean.cluster_centers_ = np.array(
                    [kmean.cluster_centers_[1], kmean.cluster_centers_[0]])
                kmean.labels_ = (kmean.labels_ == 0).astype('int')

            # note: ~ on an int label array gives -1/-2, which (used as indices
            # below) selects the opposite mask channel via negative indexing
            kmean.labels_ = (~kmean.labels_).astype('int')

            # print_info("center : {}".format(center))
            # print_info("kmean.labels_ : {}".format(kmean.labels_))

            # ----------------------------------------------
            # print_error(kmean.labels_)
            # transform the clustering result and voice_activity_detection info. into masks
            for i in range(self._hparams.frames_per_sample):
                for j in range(self._hparams.neff):
                    if voice_activity_detection_data_np[0, i, j] == 1:
                        mask[i, j, kmean.labels_[ind]] = 1
                        ind += 1

            for i in range(self._hparams.frames_per_sample):
                # apply the mask and reconstruct the waveform
                tot_ind = sample_i * self._hparams.frames_per_sample + i
                # ipdb.set_trace()
                amp = in_data_np[
                    0,
                    i, :] * self._hparams.global_std + self._hparams.global_mean

                out_data1 = (mask[i, :, 0] * amp *
                             voice_activity_detection_data_np[0, i, :])
                out_data2 = (mask[i, :, 1] * amp *
                             voice_activity_detection_data_np[0, i, :])
                out_mix = amp

                out_data1_l = 10**(out_data1 / 20) / self._hparams.amp_fac
                out_data2_l = 10**(out_data2 / 20) / self._hparams.amp_fac
                out_mix_l = 10**(out_mix / 20) / self._hparams.amp_fac

                out_stft1 = out_data1_l * in_phase_np[0, i, :]
                out_stft2 = out_data2_l * in_phase_np[0, i, :]
                out_stft_mix = out_mix_l * in_phase_np[0, i, :]

                con_data1 = out_stft1[-2:0:-1].conjugate()
                con_data2 = out_stft2[-2:0:-1].conjugate()
                con_mix = out_stft_mix[-2:0:-1].conjugate()

                out1 = np.concatenate((out_stft1, con_data1))
                out2 = np.concatenate((out_stft2, con_data2))
                out_mix = np.concatenate((out_stft_mix, con_mix))

                frame_out1 = np.fft.ifft(out1).astype(np.float64)
                frame_out2 = np.fft.ifft(out2).astype(np.float64)
                frame_mix = np.fft.ifft(out_mix).astype(np.float64)

                start = tot_ind * hop_size
                out_audio1[start:(start +
                                  len(frame_out1))] += frame_out1 * 0.5016
                out_audio2[start:(start +
                                  len(frame_out2))] += frame_out2 * 0.5016
                mix[start:(start + len(frame_mix))] += frame_mix * 0.5016

        ## the audio has been padded 3 times in AudioReader
        ## restore the original audio
        len1 = len(out_audio1) // 3
        len2 = len(out_audio2) // 3
        source1 = out_audio1[len1:2 * len1]
        source2 = out_audio2[len2:2 * len2]
        mix = mix[len2:2 * len2]

        # e.g. Length of source_1 81130, source_1 shape (81130,)

        print_info("Length of source_1 {} source_1 shape {}".format(
            len1, source1.shape))
        print_info("Length of source_2 {} source_2 shape {}".format(
            len2, source2.shape))
        print_info("Writing file {}".format(
            os.path.splitext(file_path)[0] + "_source1.wav"))
        print_info("Writing file {}".format(
            os.path.splitext(file_path)[0] + "_source2.wav"))

        librosa.output.write_wav(
            os.path.splitext(file_path)[0] + "_source1.wav", source1,
            self._hparams.sampling_rate)
        librosa.output.write_wav(
            os.path.splitext(file_path)[0] + "_source2.wav", source2,
            self._hparams.sampling_rate)
        librosa.output.write_wav(
            os.path.splitext(file_path)[0] + "_full.wav", mix,
            self._hparams.sampling_rate)
        return [(source1, self._hparams.sampling_rate),
                (source2, self._hparams.sampling_rate)]
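The inner reconstruction loop above is a standard inverse-STFT overlap-add. A compact NumPy sketch of that step, with hypothetical per-frame spectra and the frame_size/hop_size values taken from the comments in the example (256 and 64); `.real` is used here where the original casts with `astype(np.float64)`:

import numpy as np

# Sketch of the overlap-add reconstruction used in the inner loop above.
frame_size, hop_size = 256, 64
n_frames = 100

# Hypothetical one-sided spectra (frame_size // 2 + 1 = 129 bins per frame).
bins = frame_size // 2 + 1
frames_spec = np.random.rand(n_frames, bins) * \
              np.exp(1j * np.random.uniform(-np.pi, np.pi, (n_frames, bins)))

audio = np.zeros((n_frames - 1) * hop_size + frame_size)
for i, spec in enumerate(frames_spec):
    # mirror the positive-frequency bins to get a conjugate-symmetric spectrum
    full_spec = np.concatenate((spec, spec[-2:0:-1].conjugate()))
    frame = np.fft.ifft(full_spec).real                 # time-domain frame
    start = i * hop_size
    audio[start:start + frame_size] += frame * 0.5016   # same window gain as above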