def merge_label(self, binary_label_file_list, new_feat_file_list,
                    out_feat_file_list):
        """
            merging additional label for each utterance. 
        """
        utt_number = len(new_feat_file_list)
        if utt_number != len(binary_label_file_list):
            print(
                "the number of new feature input files and label files should be the same!\n"
            )
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(utt_number):

            lab_file_name = binary_label_file_list[i]
            new_feat_file_name = new_feat_file_list[i]
            out_feat_file_name = out_feat_file_list[i]

            lab_features, lab_frame_number = io_funcs.load_binary_file_frame(
                lab_file_name, self.lab_dim)
            # shape of new feature should be (1, dim)
            new_features = io_funcs.load_binary_file(new_feat_file_name,
                                                     self.feat_dim)
            # expand shape of new feature to (T, dim)
            new_features = numpy.tile(new_features, (lab_frame_number, 1))
            merged_features = numpy.zeros(
                (lab_frame_number, self.lab_dim + self.feat_dim))

            merged_features[0:lab_frame_number, 0:self.lab_dim] = lab_features
            merged_features[0:lab_frame_number, self.lab_dim:self.lab_dim +
                            self.feat_dim] = new_features[0:lab_frame_number, ]

            io_funcs.array_to_binary_file(merged_features, out_feat_file_name)
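
# A minimal standalone sketch of the merging step above, using plain numpy with
# made-up dimensions (illustrative only, not taken from any config): an
# utterance-level vector of shape (1, feat_dim) is tiled across T frames and
# concatenated with the (T, lab_dim) frame-level label matrix.

import numpy

lab_dim, feat_dim, frames = 4, 2, 3
lab_features = numpy.arange(frames * lab_dim, dtype=float).reshape(frames, lab_dim)
new_features = numpy.array([[0.5, -0.5]])      # utterance-level feature, shape (1, feat_dim)
tiled = numpy.tile(new_features, (frames, 1))  # expand to (frames, feat_dim)
merged = numpy.hstack([lab_features, tiled])   # shape (frames, lab_dim + feat_dim)
assert merged.shape == (frames, lab_dim + feat_dim)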
Example #2
    def compute_global_variance(self, file_list, feat_dim, save_dir):
        logger = logging.getLogger("compute gv")
        logger.info('computing global variance of dimension %d' % feat_dim)

        all_var_vector = numpy.zeros((len(file_list), feat_dim))

        filenum = 0
        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features = io_funcs.load_binary_file(file_name, feat_dim)
            var_vector = numpy.var(features, axis=0)
            all_var_vector[filenum, :] = var_vector
            filenum = filenum + 1

        # compute mean and variance over the per-utterance variance vectors
        print(all_var_vector.shape)
        global_mean = numpy.mean(all_var_vector, axis=0)
        global_var = numpy.var(all_var_vector, axis=0)

        gv_mean_name = os.path.join(save_dir, 'gv_mean')
        fid = open(gv_mean_name, 'wb')
        global_mean.tofile(fid)
        fid.close()

        gv_var_name = os.path.join(save_dir, 'gv_var')
        fid = open(gv_var_name, 'wb')
        global_var.tofile(fid)
        fid.close()

        print(global_mean)
        print(global_var)
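
# An illustrative sketch of the same global-variance statistics computed on
# random data instead of binary feature files (file count and feat_dim here are
# hypothetical, not taken from any recipe):

import numpy

feat_dim = 5
# one per-utterance variance vector per "file"; random data stands in for real features
per_file_var = numpy.stack([numpy.var(numpy.random.randn(100, feat_dim), axis=0)
                            for _ in range(10)])
gv_mean = numpy.mean(per_file_var, axis=0)  # average per-utterance variance
gv_var = numpy.var(per_file_var, axis=0)    # spread of the per-utterance variances
print(gv_mean.shape, gv_var.shape)          # (5,) (5,)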
    def extract_durational_features(self, dur_file_name=None, dur_data=None):

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1  ## hard coded for now
            dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        ph_count = len(dur_data)
        total_num_of_frames = int(sum(dur_data))

        duration_feature_array = numpy.zeros(
            (total_num_of_frames, self.frame_feature_size))

        frame_index = 0
        for i in range(ph_count):
            frame_number = int(dur_data[i])
            if self.subphone_feats == "coarse_coding":
                cc_feat_matrix = self.extract_coarse_coding_features_relative(
                    frame_number)

                for j in range(frame_number):
                    duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                    duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                    duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                    duration_feature_array[frame_index, 3] = float(frame_number)
                    frame_index += 1

        return duration_feature_array
Example #4
    def compute_mean(self, file_list):

        logger = logging.getLogger("acoustic_norm")

        mean_vector = numpy.zeros((1, self.feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features = io_funcs.load_binary_file(file_name,
                                                 self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension
            mean_vector += numpy.reshape(numpy.sum(features, axis=0),
                                         (1, self.feature_dimension))
            all_frame_number += current_frame_number

        mean_vector /= float(all_frame_number)

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('computed mean vector of length %d :' %
                    mean_vector.shape[1])
        logger.info(' mean: %s' % mean_vector)
        # restore the print options
        # numpy.set_printoptions(po)

        return mean_vector
Example #5
    def find_min_max_values(self, in_file_list, start_index, end_index):

        local_feature_dimension = end_index - start_index

        file_number = len(in_file_list)
        min_value_matrix = numpy.zeros((file_number, local_feature_dimension))
        max_value_matrix = numpy.zeros((file_number, local_feature_dimension))
        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            temp_min = numpy.amin(features[:, start_index:end_index], axis=0)
            temp_max = numpy.amax(features[:, start_index:end_index], axis=0)

            min_value_matrix[i, ] = temp_min
            max_value_matrix[i, ] = temp_max

        self.min_vector = numpy.amin(min_value_matrix, axis=0)
        self.max_vector = numpy.amax(max_value_matrix, axis=0)
        self.min_vector = numpy.reshape(self.min_vector,
                                        (1, local_feature_dimension))
        self.max_vector = numpy.reshape(self.max_vector,
                                        (1, local_feature_dimension))

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        self.logger.info('found min/max values of length %d:' %
                         local_feature_dimension)
        self.logger.info('  min: %s' % self.min_vector)
        self.logger.info('  max: %s' % self.max_vector)
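
# The min/max search above can be sketched with plain numpy as follows (random
# data and hypothetical dimensions): per-file minima/maxima over a column slice
# are reduced to one global min/max vector.

import numpy

feature_dimension, start_index, end_index = 10, 2, 6
files = [numpy.random.randn(50, feature_dimension) for _ in range(4)]

per_file_min = numpy.stack([numpy.amin(f[:, start_index:end_index], axis=0) for f in files])
per_file_max = numpy.stack([numpy.amax(f[:, start_index:end_index], axis=0) for f in files])

min_vector = numpy.amin(per_file_min, axis=0).reshape(1, end_index - start_index)
max_vector = numpy.amax(per_file_max, axis=0).reshape(1, end_index - start_index)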
Example #6
    def find_min_max_values(self, in_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        min_value_matrix = numpy.zeros((file_number, self.feature_dimension))
        max_value_matrix = numpy.zeros((file_number, self.feature_dimension))
        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            temp_min = numpy.amin(features, axis=0)
            temp_max = numpy.amax(features, axis=0)

            min_value_matrix[i, ] = temp_min
            max_value_matrix[i, ] = temp_max

        self.min_vector = numpy.amin(min_value_matrix, axis=0)
        self.max_vector = numpy.amax(max_value_matrix, axis=0)
        self.min_vector = numpy.reshape(self.min_vector,
                                        (1, self.feature_dimension))
        self.max_vector = numpy.reshape(self.max_vector,
                                        (1, self.feature_dimension))

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('across %d files found min/max values of length %d:' %
                    (file_number, self.feature_dimension))
        logger.info('  min: %s' % self.min_vector)
        logger.info('  max: %s' % self.max_vector)
    def compute_std(self, file_list, mean_vector):

        logger = logging.getLogger("acoustic_norm")

        std_vector = numpy.zeros((1, self.feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features = io_funcs.load_binary_file(file_name, self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension
            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

            std_vector += numpy.reshape(numpy.sum((features - mean_matrix) ** 2, axis=0), (1, self.feature_dimension))
            all_frame_number += current_frame_number

        std_vector /= float(all_frame_number)

        std_vector = std_vector ** 0.5

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('computed  std vector of length %d' % std_vector.shape[1] )
        logger.info('  std: %s' % std_vector)
        # restore the print options
        # numpy.set_printoptions(po)

        return  std_vector
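
# compute_mean and compute_std together form a two-pass mean/standard-deviation
# estimate over all frames of all files. A compact sketch of the same
# computation on in-memory chunks standing in for the per-file feature matrices:

import numpy

feature_dimension = 3
chunks = [numpy.random.randn(n, feature_dimension) for n in (40, 60, 25)]

total_frames = sum(c.shape[0] for c in chunks)
mean_vector = sum(numpy.sum(c, axis=0) for c in chunks) / float(total_frames)
std_vector = numpy.sqrt(
    sum(numpy.sum((c - mean_vector) ** 2, axis=0) for c in chunks) / float(total_frames))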
    def find_min_max_values(self, in_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        min_value_matrix = numpy.zeros((file_number, self.feature_dimension))
        max_value_matrix = numpy.zeros((file_number, self.feature_dimension))
        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            temp_min = numpy.amin(features, axis=0)
            temp_max = numpy.amax(features, axis=0)

            min_value_matrix[i, ] = temp_min
            max_value_matrix[i, ] = temp_max

        self.min_vector = numpy.amin(min_value_matrix, axis=0)
        self.max_vector = numpy.amax(max_value_matrix, axis=0)
        self.min_vector = numpy.reshape(self.min_vector, (1, self.feature_dimension))
        self.max_vector = numpy.reshape(self.max_vector, (1, self.feature_dimension))

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('across %d files found min/max values of length %d:' % (file_number,self.feature_dimension) )
        logger.info('  min: %s' % self.min_vector)
        logger.info('  max: %s' % self.max_vector)
    def denormalise_data(self, in_file_list, out_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        logger.info('MinMaxNormalisation.denormalise_data for %d files' % file_number)

        # print   self.max_vector, self.min_vector
        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        # logger.debug('reshaping fea_max_min_diff from shape %s to (1,%d)' % (fea_max_min_diff.shape, self.feature_dimension) )

        fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = fea_diff_matrix / numpy.tile(target_max_min_diff, (frame_number, 1))
            norm_features = diff_norm_matrix * (features - target_min_matrix) + fea_min_matrix
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #10
    def compute_std(self, file_list, mean_vector):

        logger = logging.getLogger("acoustic_norm")

        std_vector = numpy.zeros((1, self.feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features = io_funcs.load_binary_file(file_name,
                                                 self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension
            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))

            std_vector += numpy.reshape(
                numpy.sum((features - mean_matrix)**2, axis=0),
                (1, self.feature_dimension))
            all_frame_number += current_frame_number

        std_vector /= float(all_frame_number)

        std_vector = std_vector**0.5

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('computed  std vector of length %d' % std_vector.shape[1])
        logger.info('  std: %s' % std_vector)
        # restore the print options
        # numpy.set_printoptions(po)

        return std_vector
Example #11
    def normalise_data(self, in_file_list, out_file_list):
        file_number = len(in_file_list)

        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = numpy.tile(target_max_min_diff, (frame_number, 1)) / fea_diff_matrix

            norm_features = diff_norm_matrix * (features - fea_min_matrix) + target_min_matrix

            ## If we are to keep some columns unnormalised, use advanced indexing to
            ## reinstate original values:
            m,n = numpy.shape(features)
            for col in self.exclude_columns:
                norm_features[list(range(m)),[col]*m] = features[list(range(m)),[col]*m]

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
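
# The core of normalise_data is a per-dimension linear rescaling from
# [min, max] into a target range, with zero-range dimensions guarded by a
# divisor of 1.0. A standalone sketch (target range and data are made up):

import numpy

target_min, target_max = 0.01, 0.99
features = numpy.random.randn(100, 4)
features[:, 3] = 7.0                      # constant column: zero range

min_vec = features.min(axis=0)
max_vec = features.max(axis=0)
fea_range = max_vec - min_vec
target_range = numpy.full_like(fea_range, target_max - target_min)
target_range[fea_range <= 0.0] = 1.0      # avoid dividing by a zero range
fea_range[fea_range <= 0.0] = 1.0

norm = (target_range / fea_range) * (features - min_vec) + target_min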
    def find_min_max_values(self, in_file_list, start_index, end_index):

        local_feature_dimension = end_index - start_index

        file_number = len(in_file_list)
        min_value_matrix = numpy.zeros((file_number, local_feature_dimension))
        max_value_matrix = numpy.zeros((file_number, local_feature_dimension))
        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

            temp_min = numpy.amin(features[:, start_index:end_index], axis=0)
            temp_max = numpy.amax(features[:, start_index:end_index], axis=0)

            min_value_matrix[i, ] = temp_min
            max_value_matrix[i, ] = temp_max

        self.min_vector = numpy.amin(min_value_matrix, axis=0)
        self.max_vector = numpy.amax(max_value_matrix, axis=0)
        self.min_vector = numpy.reshape(self.min_vector, (1, local_feature_dimension))
        self.max_vector = numpy.reshape(self.max_vector, (1, local_feature_dimension))

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        self.logger.info('found min/max values of length %d:' % local_feature_dimension)
        self.logger.info('  min: %s' % self.min_vector)
        self.logger.info('  max: %s' % self.max_vector)
    def extract_durational_features(self, dur_file_name=None, dur_data=None):

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1 ## hard coded for now
            dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        ph_count = len(dur_data)
        total_num_of_frames = int(sum(dur_data))

        duration_feature_array = numpy.zeros((total_num_of_frames, self.frame_feature_size))

        frame_index=0
        for i in range(ph_count):
            frame_number = int(dur_data[i])
            if self.subphone_feats == "coarse_coding":
                cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

                for j in range(frame_number):
                    duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                    duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                    duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                    duration_feature_array[frame_index, 3] = float(frame_number)
                    frame_index+=1

            elif self.subphone_feats == 'full':
                state_number = 5  # hard coded here
                phone_duration = sum(dur_data[i, :])
                state_duration_base = 0
                for state_index in range(1, state_number + 1):
                    state_index_backward = (state_number - state_index) + 1
                    frame_number = int(dur_data[i][state_index - 1])
                    for j in range(frame_number):
                        duration_feature_array[frame_index, 0] = float(j + 1) / float(frame_number)  ## fraction through state (forwards)
                        duration_feature_array[frame_index, 1] = float(frame_number - j) / float(frame_number)  ## fraction through state (backwards)
                        duration_feature_array[frame_index, 2] = float(frame_number)  ## length of state in frames
                        duration_feature_array[frame_index, 3] = float(state_index)  ## state index (counting forwards)
                        duration_feature_array[frame_index, 4] = float(state_index_backward)  ## state index (counting backwards)

                        duration_feature_array[frame_index, 5] = float(phone_duration)  ## length of phone in frames
                        duration_feature_array[frame_index, 6] = float(frame_number) / float(phone_duration)  ## fraction of the phone made up by current state
                        duration_feature_array[frame_index, 7] = float(phone_duration - j - state_duration_base) / float(phone_duration)  ## fraction through phone (forwards)
                        duration_feature_array[frame_index, 8] = float(state_duration_base + j + 1) / float(phone_duration)  ## fraction through phone (backwards)
                        frame_index += 1

                    state_duration_base += frame_number

        return duration_feature_array
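
# A small worked example of the 'full' branch above for a single phone with
# hypothetical state durations; it shows how per-state frame counts become
# frame-level positional features (fraction through state, state index,
# fraction through phone):

import numpy

state_durations = [2, 3, 1]               # frames per state (made up)
phone_duration = sum(state_durations)

rows = []
base = 0
for state_index, frame_number in enumerate(state_durations, start=1):
    for j in range(frame_number):
        rows.append([
            (j + 1) / float(frame_number),              # fraction through state (forwards)
            (frame_number - j) / float(frame_number),   # fraction through state (backwards)
            float(state_index),                         # state index (counting forwards)
            (base + j + 1) / float(phone_duration),     # fraction through phone
        ])
    base += frame_number

duration_features = numpy.array(rows)     # shape (phone_duration, 4)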
Example #15
    def load_phone_alignment(self, alignment_file_name, dur_file_name=None):

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1  ## hard coded for now
            manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        ph_count = 0
        base_frame_index = 0
        nonsilence_frame_index_list = []
        fid = open(alignment_file_name)
        for line in fid.readlines():
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split(r'\s+', line)

            if len(temp_list) == 1:
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

                # to do - support different frame shift - currently hardwired to 5msec
                # currently under beta testing: supports different frame shift
                if dur_file_name:
                    frame_number = manual_dur_data[ph_count]
                    ph_count = ph_count + 1
                else:
                    frame_number = int((end_time - start_time) / 50000)

            label_binary_flag = self.check_silence_pattern(full_label)

            if self.remove_frame_features:
                if label_binary_flag == 0:
                    for frame_index in range(frame_number):
                        nonsilence_frame_index_list.append(base_frame_index +
                                                           frame_index)
                base_frame_index = base_frame_index + frame_number
            elif self.subphone_feats == 'none':
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.append(base_frame_index)
                base_frame_index = base_frame_index + 1

        fid.close()

        return nonsilence_frame_index_list
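
# The alignment parsing above expects HTK-style label lines of the form
# "start end label", with times in 100 ns units, so (end - start) / 50000 gives
# the number of 5 ms frames. A minimal sketch of that conversion on a made-up
# label line:

import re

line = "3050000 3600000 xx^xx-sil+hh=iy"        # hypothetical HTK-style label line
start_time, end_time, full_label = re.split(r'\s+', line.strip())[:3]
frame_number = (int(end_time) - int(start_time)) // 50000   # 100 ns units -> 5 ms frames
print(full_label, frame_number)                             # -> ... 11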
Example #16
    def normalise_data(self, in_file_list, out_file_list):
        file_number = len(in_file_list)

        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        fea_max_min_diff = numpy.reshape(fea_max_min_diff,
                                         (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            fea_max_matrix = numpy.tile(self.max_vector, (frame_number, 1))

            for m in range(features.shape[0]):
                for n in range(features.shape[1]):
                    if features[m][n] < fea_min_matrix[m][n]:
                        features[m][n] = fea_min_matrix[m][n]
                    elif features[m][n] > fea_max_matrix[m][n]:
                        features[m][n] = fea_max_matrix[m][n]

            target_min_matrix = numpy.tile(
                self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = numpy.tile(target_max_min_diff,
                                          (frame_number, 1)) / fea_diff_matrix

            norm_features = diff_norm_matrix * (
                features - fea_min_matrix) + target_min_matrix

            ## If we are to keep some columns unnormalised, use advanced indexing to
            ## reinstate original values:
            m, n = numpy.shape(features)
            for col in self.exclude_columns:
                norm_features[list(range(m)), [col] * m] = features[list(range(m)), [col] * m]

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #17
    def normal_standardization(self, in_file_list, out_file_list):
        mean_vector = self.compute_mean(in_file_list)
        std_vector = self.compute_std(in_file_list, mean_vector)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    def load_phone_alignment(self, alignment_file_name, dur_file_name=None):

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1  ## hard coded for now
            manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        ph_count = 0
        base_frame_index = 0
        nonsilence_frame_index_list = []
        fid = open(alignment_file_name)
        for line in fid.readlines():
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split(r'\s+', line)

            if len(temp_list) == 1:
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

                # to do - support different frame shift - currently hardwired to 5msec
                # currently under beta testing: supports different frame shift
                if dur_file_name:
                    frame_number = manual_dur_data[ph_count]
                    ph_count = ph_count + 1
                else:
                    frame_number = int((end_time - start_time) / 50000)

            label_binary_flag = self.check_silence_pattern(full_label)

            if self.remove_frame_features:
                if label_binary_flag == 0:
                    for frame_index in range(frame_number):
                        nonsilence_frame_index_list.append(base_frame_index + frame_index)
                base_frame_index = base_frame_index + frame_number
            elif self.subphone_feats == 'none':
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.append(base_frame_index)
                base_frame_index = base_frame_index + 1

        fid.close()

        return nonsilence_frame_index_list
Example #19
    def normal_standardization(self, in_file_list, out_file_list):
        mean_vector = self.compute_mean(in_file_list)
        std_vector = self.compute_std(in_file_list, mean_vector)

        io_funcs = BinaryIOCollection()
        file_number = len(in_file_list)
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension

            mean_matrix = numpy.tile(mean_vector, (current_frame_number, 1))
            std_matrix = numpy.tile(std_vector, (current_frame_number, 1))

            norm_features = (features - mean_matrix) / std_matrix

            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
    def remove_silence(self,
                       in_data_list,
                       in_align_list,
                       out_data_list,
                       dur_file_list=None):
        file_number = len(in_data_list)
        align_file_number = len(in_align_list)

        if file_number != align_file_number:
            print("The number of input and alignment files is not the same!\n")
            sys.exit(1)
        if file_number != len(out_data_list):
            print("The number of input and output files is not the same!\n")
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(file_number):

            if self.label_type == "phone_align":
                if dur_file_list:
                    dur_file_name = dur_file_list[i]
                else:
                    dur_file_name = None
                nonsilence_indices = self.load_phone_alignment(
                    in_align_list[i], dur_file_name)
            else:
                nonsilence_indices = self.load_alignment(in_align_list[i])

            ori_cmp_data = io_funcs.load_binary_file(in_data_list[i],
                                                     self.n_cmp)

            frame_number = ori_cmp_data.size // self.n_cmp

            if len(nonsilence_indices) == frame_number:
                print('WARNING: no silence found!')
                # previously: continue -- in fact we should keep non-silent data!

            ## if the labels have a few more frames than the audio, this can break the indexing; remove them:
            nonsilence_indices = [
                ix for ix in nonsilence_indices if ix < frame_number
            ]

            new_cmp_data = ori_cmp_data[nonsilence_indices, ]

            io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
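
# Silence removal ultimately reduces to advanced integer indexing with the
# non-silence frame indices. A standalone sketch on random data (indices and
# shapes are illustrative only):

import numpy

cmp_data = numpy.random.randn(10, 3)                 # frames x acoustic dimensions
nonsilence_indices = [2, 3, 4, 7, 8]                 # e.g. derived from an alignment file
nonsilence_indices = [ix for ix in nonsilence_indices if ix < cmp_data.shape[0]]
trimmed = cmp_data[nonsilence_indices, :]            # keep only the non-silent frames
assert trimmed.shape == (len(nonsilence_indices), 3)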
Example #21
    def denormalise_data(self, in_file_list, out_file_list):

        logger = logging.getLogger("acoustic_norm")

        file_number = len(in_file_list)
        logger.info('MinMaxNormalisation.denormalise_data for %d files' %
                    file_number)

        # print   self.max_vector, self.min_vector
        fea_max_min_diff = self.max_vector - self.min_vector
        diff_value = self.target_max_value - self.target_min_value
        # logger.debug('reshaping fea_max_min_diff from shape %s to (1,%d)' % (fea_max_min_diff.shape, self.feature_dimension) )

        fea_max_min_diff = numpy.reshape(fea_max_min_diff,
                                         (1, self.feature_dimension))

        target_max_min_diff = numpy.zeros((1, self.feature_dimension))
        target_max_min_diff.fill(diff_value)

        target_max_min_diff[fea_max_min_diff <= 0.0] = 1.0
        fea_max_min_diff[fea_max_min_diff <= 0.0] = 1.0

        io_funcs = BinaryIOCollection()
        for i in range(file_number):
            features = io_funcs.load_binary_file(in_file_list[i],
                                                 self.feature_dimension)

            frame_number = features.size // self.feature_dimension
            fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
            target_min_matrix = numpy.tile(
                self.target_min_value, (frame_number, self.feature_dimension))

            fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
            diff_norm_matrix = fea_diff_matrix / numpy.tile(
                target_max_min_diff, (frame_number, 1))
            norm_features = diff_norm_matrix * (
                features - target_min_matrix) + fea_min_matrix
            io_funcs.array_to_binary_file(norm_features, out_file_list[i])
Example #22
    def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
        file_number = len(in_data_list)
        align_file_number = len(in_align_list)

        if file_number != align_file_number:
            print("The number of input and alignment files is not the same!\n")
            sys.exit(1)
        if file_number != len(out_data_list):
            print("The number of input and output files is not the same!\n")
            sys.exit(1)

        io_funcs = BinaryIOCollection()
        for i in range(file_number):

            if self.label_type=="phone_align":
                if dur_file_list:
                    dur_file_name = dur_file_list[i]
                else:
                    dur_file_name = None
                nonsilence_indices = self.load_phone_alignment(in_align_list[i], dur_file_name)
            else:
                nonsilence_indices = self.load_alignment(in_align_list[i])

            ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)

            frame_number = ori_cmp_data.size // self.n_cmp

            if len(nonsilence_indices) == frame_number:
                print('WARNING: no silence found!')
                # previously: continue -- in fact we should keep non-silent data!

            ## if the labels have a few more frames than the audio, this can break the indexing; remove them:
            nonsilence_indices = [ix for ix in nonsilence_indices if ix < frame_number]

            new_cmp_data = ori_cmp_data[nonsilence_indices,]

            io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
Example #23
    def compute_mean(self, file_list):

        logger = logging.getLogger("acoustic_norm")

        mean_vector = numpy.zeros((1, self.feature_dimension))
        all_frame_number = 0

        io_funcs = BinaryIOCollection()
        for file_name in file_list:
            features = io_funcs.load_binary_file(file_name, self.feature_dimension)
            current_frame_number = features.size // self.feature_dimension
            mean_vector += numpy.reshape(numpy.sum(features, axis=0), (1, self.feature_dimension))
            all_frame_number += current_frame_number

        mean_vector /= float(all_frame_number)

        # po=numpy.get_printoptions()
        # numpy.set_printoptions(precision=2, threshold=20, linewidth=1000, edgeitems=4)
        logger.info('computed mean vector of length %d :' % mean_vector.shape[1] )
        logger.info(' mean: %s' % mean_vector)
        # restore the print options
        # numpy.set_printoptions(po)

        return  mean_vector
Example #24
    def extract_base_features(self, feat_dir_path, feat_switch, list_of_files,
                              decomposition_unit):
        ### load Binary module ###
        io_funcs = BinaryIOCollection()
        htsclass = readHTSlabelFile()

        ### read file by file ###
        for i in range(len(list_of_files)):
            filename = list_of_files[i]
            print(filename)

            binary_label_dir = feat_dir_path['input_binary']
            label_align_dir = feat_dir_path['input_labfile']
            txt_dir = feat_dir_path['input_txt']
            out_feat_dir = feat_dir_path['output_feat']

            in_filename = os.path.join(binary_label_dir, filename + '.lab')
            in_lab_file = os.path.join(label_align_dir, filename + '.lab')
            in_txt_file = os.path.join(txt_dir, filename + '.txt')
            out_filename = os.path.join(out_feat_dir, filename + '.lab')

            ip1 = open(in_txt_file, 'r')
            text_Data = ip1.readlines()
            ip1.close()

            list_of_words = text_Data[0].split()

            [phone, ph_arr,
             mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)

            features = io_funcs.load_binary_file(in_filename, 1)

            file_len = len(phone)

            op1 = open(out_filename, 'w')
            count = 0
            frame_count = 0
            phone_count = 0
            wc = 0
            seg_count = 0

            feat_arr = []
            prev_feat_arr = []
            syl_identity = self.zeros(300, 1)
            syl = ''
            phinsyl = 0
            for j in range(len(features)):
                count = count + 1

                if (count == 601):
                    count = 0
                    feat_arr = []
                    sil_flag = 0
                    continue

                if (count == 59 and int(features[j]) == 1):
                    sil_flag = 1
                if (count == 148 and int(features[j]) == 1):
                    sil_flag = 0

                if (count <= 348 or (count >= 406 and count <= 421)
                        or count > 592):
                    continue

                feat_arr.append(int(features[j]))

                if (count == 592):

                    if np.abs(frame_count - int(ph_arr[1][phone_count] *
                                                (10**-4) / 5)) <= 1:
                        ph_identity = features[j - 492:j - 443]
                        ph_identity = np.reshape(ph_identity, len(ph_identity),
                                                 -1)
                        syl_identity[phinsyl * 50:(phinsyl + 1) * 50 -
                                     1] = ph_identity
                        syl = syl + phone[phone_count]
                        if phone[phone_count] == '#':
                            syl_identity[(phinsyl + 1) * 50 - 1] = 1
                        phinsyl += 1
                        phone_count += 1

                    frame_count += 1

                    if (len(prev_feat_arr) != 0 and prev_feat_arr == feat_arr):
                        continue
                    else:
                        prev_feat_arr = feat_arr

                        if (syl != '#' and syl != ''):
                            syl_vec = ''
                            new_syl_identity = [
                                0.99 if x == 1 else 0.01 for x in syl_identity
                            ]
                            for x in range(len(new_syl_identity)):
                                syl_vec = syl_vec + str(
                                    new_syl_identity[x]) + ' '
                            op1.write(syl_vec + '\n')

                        ### reset syllable information ###
                        phinsyl = 0
                        syl = ''
                        syl_identity = self.zeros(300, 1)

                        if (sil_flag == 1):
                            continue
                        seg_count += 1
                        new_arr = [
                            0.99 if x == 1 else 0.01 for x in prev_feat_arr
                        ]
                        for item in new_arr:
                            op1.write("%s " % item)

                        ### word ending information ###
                        if (mean_f0_arr[phone_count][5] -
                                mean_f0_arr[phone_count - 1][5] != 0
                                and phone[phone_count] != 'pau'):
                            wc += 1
                        word = list_of_words[wc - 1]
                        if word in self.wrd_embeds:
                            word_vec = self.wrd_embeds[word]
                        else:
                            word_vec = self.wrd_embeds['*UNKNOWN*']
                        if (phone[phone_count] == 'pau'):
                            word_vec = self.wrd_embeds['*UNKNOWN*']
                        op1.write(word_vec + ' ')
                        continue
            op1.close()
Example #25
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of the data to trim
        label_list: list of binary labels which contain the trimming criterion
        label_dimension: dimension of the label data
        silence_feature_index: index of the feature in the labels which marks silence: 1 means silence (trim), 0 means keep.
        percent_to_keep: integer percentage of silent frames to retain (0 keeps none).
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(
            audio_label_difference
        ) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] *
                                   int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        #         print silence_flag
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or 1s, that's ok:
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' %
              (100 * numpy.sum(silence_flag / float(len(silence_flag))),
               int(numpy.sum(silence_flag))))
        non_silence_indices = numpy.nonzero(silence_flag == 0)  ## get the indices where silence_flag == 0 is True
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            # print silence_flag
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as step value in slice
            ## -1 due to weird error with STRAIGHT features at line 144:
            ## IndexError: index 445 is out of bounds for axis 0 with size 445
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array(
                    [1])  ## avoid errors in case there is no silence
            print(
                '   Restore %s%% (every %sth frame: %s frames) of silent frames'
                % (percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack(
                [non_silence_indices[0], silence_indices_to_keep]))
            ##  ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
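
# The percent_to_keep option keeps a thinned subset of silent frames by taking
# every (100 // percent_to_keep)-th silence index and appending it to the
# non-silence indices. A small sketch of that selection on a made-up silence flag:

import numpy

silence_flag = numpy.array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1])
percent_to_keep = 50

non_silence_indices = numpy.nonzero(silence_flag == 0)[0]
silence_indices = numpy.nonzero(silence_flag == 1)[0]
every_nth = 100 // percent_to_keep
kept_silence = silence_indices[::every_nth]          # every 2nd silent frame here
keep_indices = numpy.hstack([non_silence_indices, kept_silence])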
    def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit):
        ### load Binary module ###
        io_funcs = BinaryIOCollection()
        htsclass = readHTSlabelFile()
        
        ### read file by file ###
        for i in range(len(list_of_files)):    
            filename = list_of_files[i]     
            print(filename)
            
            binary_label_dir = feat_dir_path['input_binary']
            label_align_dir = feat_dir_path['input_labfile']
            txt_dir = feat_dir_path['input_txt']
            out_feat_dir = feat_dir_path['output_feat']
            
            in_filename = os.path.join(binary_label_dir, filename + '.lab')
            in_lab_file = os.path.join(label_align_dir, filename + '.lab')
            in_txt_file = os.path.join(txt_dir, filename + '.txt')
            out_filename = os.path.join(out_feat_dir, filename + '.lab')
            
            ip1 = open(in_txt_file, 'r')
            text_Data = ip1.readlines()
            ip1.close()
            
            list_of_words = text_Data[0].split()

            [phone, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
            
            features = io_funcs.load_binary_file(in_filename, 1)
        
            file_len = len(phone)
            
            op1 = open(out_filename, 'w')
            count = 0
            frame_count = 0
            phone_count = 0
            wc = 0
            seg_count = 0
        
            feat_arr = []
            prev_feat_arr = []
            syl_identity = self.zeros(300,1)
            syl = ''
            phinsyl = 0
            for j in range(len(features)):
                count = count + 1
        
                if (count == 601):
                    count = 0
                    feat_arr = []
                    sil_flag = 0
                    continue
        
                if (count == 59 and int(features[j]) == 1):
                    sil_flag = 1
                if (count == 148 and int(features[j]) == 1):
                    sil_flag = 0
        
                if (count <= 348 or (count >= 406 and count <= 421) or count > 592):
                    continue
        
                feat_arr.append(int(features[j]))
        
                if(count == 592):
                    
                    if np.abs(frame_count - int(ph_arr[1][phone_count] * (10 ** -4) / 5)) <= 1:
                            ph_identity = features[j-492:j-443]
                            ph_identity = np.reshape(ph_identity, len(ph_identity), -1)
                            syl_identity[phinsyl*50:(phinsyl+1)*50-1] = ph_identity
                            syl = syl+phone[phone_count]
                            if phone[phone_count] == '#':
                                syl_identity[(phinsyl+1)*50-1] = 1
                            phinsyl += 1
                            phone_count += 1
                    
                    frame_count += 1
          
                    if (len(prev_feat_arr) != 0 and prev_feat_arr == feat_arr):
                        continue
                    else:
                        prev_feat_arr = feat_arr
                        
                        if(syl!='#' and syl!=''):
                            syl_vec = ''
                            new_syl_identity = [0.99 if x==1 else 0.01 for x in syl_identity]
                            for x in range(len(new_syl_identity)):
                                syl_vec = syl_vec+str(new_syl_identity[x])+' '
                            op1.write(syl_vec+'\n')
                        
                        ### reset syllable information ###
                        phinsyl = 0
                        syl = ''
                        syl_identity = self.zeros(300, 1)
                        
                        if (sil_flag == 1):
                            continue
                        seg_count += 1
                        new_arr = [0.99 if x==1 else 0.01 for x in prev_feat_arr]
                        for item in new_arr:
                            op1.write("%s " % item)
                                
                        ### word ending information ###        
                        if(mean_f0_arr[phone_count][5] - mean_f0_arr[phone_count - 1][5] != 0 and phone[phone_count] != 'pau'):
                            wc += 1
                        word = list_of_words[wc - 1]    
                        if word in self.wrd_embeds:
                            word_vec = self.wrd_embeds[word]
                        else:
                            word_vec = self.wrd_embeds['*UNKNOWN*']
                        if(phone[phone_count] == 'pau'):
                            word_vec = self.wrd_embeds['*UNKNOWN*']
                        op1.write(word_vec + ' ')
                        continue
            op1.close()
Example #27

#  quick and dirty utility to print out binary files, for debugging

import sys
# import numpy
from io_funcs.binary_io import BinaryIOCollection

if __name__ == '__main__':

    ## shall we read the logging config file from command line?
    if len(sys.argv) < 3:
        print('usage: python view.py dimension filename(s)')
        sys.exit(1)

    dimension = int(sys.argv[1])
    fnames = sys.argv[2:]

    print(fnames)

    io_funcs = BinaryIOCollection()
    for f in fnames:
        features = io_funcs.load_binary_file(f, dimension)

    print(features.shape)
    # print(features)
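
# For quick inspection without the toolkit, a file in this raw binary format can
# usually be read with numpy.fromfile, assuming float32 little-endian storage
# (an assumption about how BinaryIOCollection writes data, not confirmed here):

import sys
import numpy

def view_binary(filename, dimension):
    # assumes the file is a flat float32 array laid out as frames x dimension
    data = numpy.fromfile(filename, dtype=numpy.float32)
    assert data.size % dimension == 0, 'file size is not a multiple of the dimension'
    return data.reshape(-1, dimension)

if __name__ == '__main__':
    mat = view_binary(sys.argv[2], int(sys.argv[1]))
    print(mat.shape)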
Example #28
    def extract_base_features(self, feat_dir_path, feat_switch, list_of_files,
                              decomposition_unit, unit_dim):
        ### load Binary module ###
        io_funcs = BinaryIOCollection()
        htsclass = readHTSlabelFile()

        ### read file by file ###
        for i in range(len(list_of_files)):
            filename = list_of_files[i]
            print(filename)

            binary_label_dir = feat_dir_path['input_binary']
            label_align_dir = feat_dir_path['input_labfile']
            txt_dir = feat_dir_path['input_txt']
            out_feat_dir = feat_dir_path['output_feat']

            in_filename = os.path.join(binary_label_dir, filename + '.lab')
            in_lab_file = os.path.join(label_align_dir, filename + '.lab')
            in_txt_file = os.path.join(txt_dir, filename + '.txt')
            out_filename = os.path.join(out_feat_dir, filename + '.lab')

            word_embed_list = []
            binary_feat_list = []
            identity_vec_list = []
            dur_feat_list = []
            dur_list = []

            ### read text file ###
            if feat_switch['wordEmbed']:
                ip1 = open(in_txt_file, 'r')
                text_Data = ip1.readlines()
                ip1.close()

                norm_text = self.format_text(text_Data[0].strip())
                norm_text = norm_text.replace('OUF', 'O U F')
                norm_text = norm_text.replace('Mmm', 'M m m')
                norm_text = norm_text.replace('USA', 'U S A')
                list_of_words = norm_text.split()

            ### read label file ###
            [phone, st_arr, ph_arr,
             mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
            file_len = len(phone)

            ### read binary label file ###
            features = io_funcs.load_binary_file(in_filename, 1)

            ### take non-silence region ###
            ph_start = int(ph_arr[0][1] / (np.power(10, 4) * 5))
            ph_end = int(ph_arr[1][file_len - 2] / (np.power(10, 4) * 5))

            ### extract duration features ###
            frame_feat_list = features.reshape(
                len(features) // unit_dim['frame'], unit_dim['frame'])
            frame_feat_list = frame_feat_list[ph_start:ph_end, :]
            dur_feat_list = frame_feat_list[:, -9:]

            ### initialise common variables ###
            num_of_frames = 0

            ### initialise syllable variables ###
            #frame_indx=0;
            syl_num_of_frames = 0
            wc = 0
            phinsyl = 0
            syl_identity = self.zeros(300, 1)
            syl = ''

            j = 0
            while j < file_len:
                #### ignore silence ####
                if (phone[j] == '#' or phone[j] == 'pau'):
                    j = j + 1
                    continue

                ### extract boundaries of phone ###
                ph_start = int(ph_arr[0][j] / (np.power(10, 4) * 5))
                ph_end = int(ph_arr[1][j] / (np.power(10, 4) * 5))
                num_of_frames = int(sum(st_arr[j][:] // (np.power(10, 4) * 5)))
                mid_frame = (ph_start + ph_end) // 2

                ### syllable ending information ###
                syl_end = 0
                if (mean_f0_arr[j + 1][3] - mean_f0_arr[j][3] != 0):
                    syl_end = 1

                ### word ending information ###
                word_end = 0
                if (mean_f0_arr[j + 1][5] - mean_f0_arr[j][5] != 0):
                    word_end = 1

                ### syllable duration ###
                syl_num_of_frames += num_of_frames

                ### extract binary phone-level features ###
                st_indx = unit_dim['frame'] * mid_frame
                mid_frame_feat = features[st_indx:st_indx + 592]
                mid_frame_feat = np.reshape(mid_frame_feat,
                                            len(mid_frame_feat))

                ### word embedding features ###
                if feat_switch['wordEmbed']:
                    ### word embeddings for syllable ###
                    word = list_of_words[wc]
                    if (word_end and phone[j] != 'pau'):
                        wc += 1
                    if (phone[j] == 'pau'):
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    elif word in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word]
                    elif word.lower() in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word.lower()]
                    else:
                        word_vec = self.wrd_embeds['*UNKNOWN*']

                ### identity features ###
                if feat_switch['identity']:
                    ### phone identity features ###
                    ph_identity = mid_frame_feat[99:148]

                    if decomposition_unit == 'syllable':
                        ### syllable identity features
                        st_indx = phinsyl * 50
                        syl_identity[st_indx:st_indx + 49] = ph_identity
                        syl = syl + phone[j]
                        ### to make nucleus centre ###
                        #if phone[j] in self.vlist:
                        #    vow_index = phinsyl

                        ### if silence is allowed ###
                        #if phone[j] == '#':
                        #    syl_identity[(phinsyl+1)*50-1] = 1
                        phinsyl += 1

                #### select features depending on decomposition unit ###

                ### frame-level features ###
                if (decomposition_unit == 'frame'):

                    ### duration features for phone ###
                    dur_list.append(num_of_frames)

                    ### frame level binary features ###
                    if feat_switch['binary'] and j + 2 == file_len:
                        ### load normalisation statistics ###
                        label_norm_float_file = os.path.join(
                            binary_label_dir, '../label_norm_float_HTS.dat')
                        fid = open(label_norm_float_file, 'r')
                        arr12 = [float(x.strip()) for x in fid.readlines()]
                        fid.close()
                        min_vector = np.array(arr12[0:len(arr12) / 2])
                        max_vector = np.array(arr12[len(arr12) / 2:len(arr12)])
                        max_range_vector = max_vector - min_vector
                        max_range_vector[max_range_vector == 0] = 1

                        ### normalise features ###
                        nrows = len(frame_feat_list)
                        for x in xrange(nrows):
                            norm_frame_feat = (
                                frame_feat_list[x, :] -
                                min_vector) / max_range_vector * 0.98 + 0.01
                            norm_frame_vec = ' '.join(
                                map(str, norm_frame_feat[:]))
                            binary_feat_list.append(norm_frame_vec)

                    ### embedding features ###
                    if feat_switch['wordEmbed']:
                        for x in xrange(num_of_frames):
                            word_embed_list.append(word_vec)

                ### phone-level features ###
                if (decomposition_unit == 'phone'):

                    ### duration features for phone ###
                    dur_list.append(num_of_frames)

                    ### phone level binary features ###
                    if feat_switch['binary']:
                        #ph_feat = np.concatenate((mid_frame_feat[0:99], mid_frame_feat[348:]), axis=0)
                        norm_ph_feat = [
                            0.99 if x == 1 else 0.01 for x in mid_frame_feat
                        ]
                        norm_ph_vec = ' '.join(map(str, norm_ph_feat[:]))
                        binary_feat_list.append(norm_ph_vec)

                    ### embedding features ###
                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)

                    ### phone-identity features ###
                    if feat_switch['identity']:
                        extra_ph = 1 if phone[j] == 'o~' else 0
                        ph_identity = np.append(ph_identity, extra_ph)
                        #norm_ph_identity = [0.99 if x==1 else 0.01 for x in ph_identity]
                        norm_ph_identity = [int(x) for x in ph_identity]
                        norm_ph_identity_vec = ' '.join(
                            map(str, norm_ph_identity[:]))
                        identity_vec_list.append(norm_ph_identity_vec)

                ### syllable level features ###
                if (decomposition_unit == 'syllable' and syl_end):
                    #print syl

                    ### duration features for syllable ###
                    dur_list.append(syl_num_of_frames)

                    ### syllable and above level binary features ###
                    if feat_switch['binary']:
                        syl_feat = []
                        for x in range(len(mid_frame_feat)):
                            if (x < 348 or (x >= 405 and x < 421)):
                                continue
                            syl_feat.append(mid_frame_feat[x])
                        norm_syl_feat = [
                            0.99 if x == 1 else 0.01 for x in syl_feat
                        ]
                        norm_syl_vec = ' '.join(map(str, norm_syl_feat[:]))
                        binary_feat_list.append(norm_syl_vec)

                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)

                    ### syllable-identity features ###
                    if feat_switch['identity']:
                        ### to make nucleus centre ###
                        #if(vow_index<=1):
                        #    syl_identity = np.roll(syl_identity, 50*(vow_index+1))
                        norm_syl_identity = [
                            0.99 if x == 1 else 0.01 for x in syl_identity
                        ]
                        norm_syl_identity_vec = ' '.join(
                            map(str, norm_syl_identity[:]))
                        identity_vec_list.append(norm_syl_identity_vec)

                    ### reset syllable information ###
                    phinsyl = 0
                    syl = ''
                    syl_num_of_frames = 0
                    syl_identity = self.zeros(300, 1)

                j += 1

            ### default vectors to use ###
            if feat_switch['identity'] and decomposition_unit == 'syllable':
                syl_identity = self.zeros(300, 1)
                norm_syl_identity = [
                    0.99 if x == 1 else 0.01 for x in syl_identity
                ]
                norm_syl_identity_vec = ' '.join(map(str,
                                                     norm_syl_identity[:]))
            if feat_switch['wordEmbed']:
                word_vec = self.wrd_embeds['*UNKNOWN*']

            ### writing features to output file ###
            op1 = open(out_filename, 'w')
            num_of_vectors = max(len(binary_feat_list), len(identity_vec_list),
                                 len(word_embed_list))
            for x in range(num_of_vectors):
                ### initialise feat vector ###
                feat_vec = ''

                ### binary features ###
                if feat_switch['binary']:
                    feat_vec = feat_vec + binary_feat_list[x] + ' '

                ### word embeddings ###
                if feat_switch['wordEmbed']:
                    if feat_switch['wordEmbed'] >= 3:
                        if (x - 1 < 0):
                            feat_vec = feat_vec + word_vec + ' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x - 1] + ' '
                    feat_vec = feat_vec + word_embed_list[x] + ' '
                    if feat_switch['wordEmbed'] >= 3:
                        if (x + 1 >= len(binary_feat_list)):
                            feat_vec = feat_vec + word_vec + ' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x + 1] + ' '

                ### identity features ###
                if feat_switch['identity']:
                    if feat_switch['identity'] >= 5:
                        if (x - 2 < 0):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x -
                                                                    2] + ' '
                    if feat_switch['identity'] >= 3:
                        if (x - 1 < 0):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x -
                                                                    1] + ' '
                    feat_vec = feat_vec + identity_vec_list[x] + ' '
                    if feat_switch['identity'] >= 3:
                        if (x + 1 >= len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x +
                                                                    1] + ' '
                    if feat_switch['identity'] >= 5:
                        if (x + 2 >= len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec + ' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x +
                                                                    2] + ' '
                op1.write(feat_vec + '\n')
                #for z in range(dur_list[x]):
                #    op1.write(feat_vec + ' '.join(map(str, dur_feat_list[frame_indx+z,:]))+'\n')
                #frame_indx+=dur_list[x]
            op1.close()
    def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit, unit_dim):
        ### load Binary module ###
        io_funcs = BinaryIOCollection()
        htsclass = readHTSlabelFile()
        
        ### read file by file ###
        for i in range(len(list_of_files)):    
            filename = list_of_files[i]     
            print filename
            
            binary_label_dir = feat_dir_path['input_binary']
            label_align_dir = feat_dir_path['input_labfile']
            txt_dir = feat_dir_path['input_txt']
            out_feat_dir = feat_dir_path['output_feat']
            
            in_filename = os.path.join(binary_label_dir, filename + '.lab');
            in_lab_file = os.path.join(label_align_dir, filename + '.lab')
            in_txt_file = os.path.join(txt_dir, filename + '.txt')
            out_filename = os.path.join(out_feat_dir, filename + '.lab');
            
            word_embed_list = []
            binary_feat_list = []
            identity_vec_list = []
            dur_feat_list = []
            dur_list = []
            
            ### read text file ###
            if feat_switch['wordEmbed']:
                ip1 = open(in_txt_file, 'r')
                text_Data = ip1.readlines()
                ip1.close()
                
                norm_text = self.format_text(text_Data[0].strip())
                norm_text = norm_text.replace('OUF', 'O U F')
                norm_text = norm_text.replace('Mmm', 'M m m')
                norm_text = norm_text.replace('USA', 'U S A')
                list_of_words = norm_text.split()

            ### read label file ###
            [phone, st_arr, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
            file_len = len(phone)
            
            ### read binary label file ###
            features = io_funcs.load_binary_file(in_filename, 1)
            
            ### take non-silence region ###
            ph_start = int(ph_arr[0][1] / (np.power(10, 4) * 5));
            ph_end = int(ph_arr[1][file_len-2] / (np.power(10, 4) * 5));
            
            ### extract duration features ###
            frame_feat_list = features.reshape(len(features)/unit_dim['frame'], unit_dim['frame'])
            frame_feat_list = frame_feat_list[ph_start: ph_end, :]
            dur_feat_list   = frame_feat_list[:,-9:]
            
            ### initialise common variables ###
            num_of_frames=0;
            
            ### initialise syllable variables ###
            #frame_indx=0;
            syl_num_of_frames=0
            wc = 0; phinsyl=0;
            syl_identity = self.zeros(300,1)
            syl = ''
            
            j=0;
            while j < file_len: 
                #### ignore silence ####
                if(phone[j] == '#' or phone[j] == 'pau'):
                    j = j + 1
                    continue;            
                
                ### extract boundaries of phone ###
                ph_start = int(ph_arr[0][j] / (np.power(10, 4) * 5));
                ph_end = int(ph_arr[1][j] / (np.power(10, 4) * 5));
                num_of_frames = sum(st_arr[j][:]/(np.power(10,4)*5))
                mid_frame = (ph_start+ph_end)/2
                
                ### syllable ending information ###
                syl_end = 0        
                if(mean_f0_arr[j + 1][3] - mean_f0_arr[j][3] != 0):
                    syl_end = 1
                
                ### word ending information ###
                word_end = 0        
                if(mean_f0_arr[j + 1][5] - mean_f0_arr[j][5] != 0):
                    word_end = 1
                
                ### syllable duration ###
                syl_num_of_frames += num_of_frames
                
                ### extract binary phone-level features ###
                st_indx = unit_dim['frame']*mid_frame
                mid_frame_feat = features[st_indx:st_indx+592]
                mid_frame_feat = np.reshape(mid_frame_feat, len(mid_frame_feat))
                
                ### word embedding features ###
                if feat_switch['wordEmbed']:            
                    ### word embeddings for syllable ###
                    word = list_of_words[wc]
                    if(word_end and phone[j]!='pau'): 
                        wc += 1    
                    if(phone[j] == 'pau'):
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    elif word in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word]
                    elif word.lower() in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word.lower()]
                    else:
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                
                ### identity features ###
                if feat_switch['identity']:                
                    ### phone identity features ###
                    ph_identity = mid_frame_feat[99:148]
                    
                    if decomposition_unit == 'syllable':
                        ### syllable identity features 
                        st_indx = phinsyl*50
                        syl_identity[st_indx:st_indx+49] = ph_identity
                        syl = syl + phone[j]
                        ### to make nucleus centre ###
                        #if phone[j] in self.vlist:
                        #    vow_index = phinsyl
                        
                        ### if silence is allowed ###
                        #if phone[j] == '#':
                        #    syl_identity[(phinsyl+1)*50-1] = 1
                        phinsyl += 1
                
                #### select features depending on decomposition unit ###
                
                ### frame-level features ###
                if(decomposition_unit=='frame'):
                    
                    ### duration features for phone ###
                    dur_list.append(num_of_frames)
                    
                    ### frame level binary features ###
                    if feat_switch['binary'] and j+2==file_len:
                        ### load normalisation statistics ###
                        label_norm_float_file = os.path.join(binary_label_dir, '../label_norm_float_HTS.dat');
                        fid = open(label_norm_float_file, 'r')
                        arr12 = [float(x.strip()) for x in fid.readlines()]
                        fid.close()
                        min_vector = np.array(arr12[0:len(arr12)/2])
                        max_vector = np.array(arr12[len(arr12)/2:len(arr12)])
                        max_range_vector = max_vector - min_vector
                        max_range_vector[max_range_vector==0] = 1
                        
                        ### normalise features ###
                        nrows = len(frame_feat_list)
                        for x in xrange(nrows):
                            norm_frame_feat = (frame_feat_list[x,:] - min_vector) / max_range_vector*0.98 + 0.01
                            norm_frame_vec = ' '.join(map(str, norm_frame_feat[:]))
                            binary_feat_list.append(norm_frame_vec)
                    
                    ### embedding features ###
                    if feat_switch['wordEmbed']:
                        for x in xrange(num_of_frames):
                            word_embed_list.append(word_vec)
                        
                ### phone-level features ###
                if(decomposition_unit=='phone'):
                    
                    ### duration features for phone ###
                    dur_list.append(num_of_frames)
                    
                    ### phone level binary features ###
                    if feat_switch['binary']:
                        #ph_feat = np.concatenate((mid_frame_feat[0:99], mid_frame_feat[348:]), axis=0)
                        norm_ph_feat = [0.99 if x==1 else 0.01 for x in mid_frame_feat]
                        norm_ph_vec = ' '.join(map(str, norm_ph_feat[:]))
                        binary_feat_list.append(norm_ph_vec)
                    
                    ### embedding features ###
                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)
                    
                    ### phone-identity features ###
                    if feat_switch['identity']:
                        extra_ph = 1 if phone[j] == 'o~' else 0
                        ph_identity = np.append(ph_identity, extra_ph)
                        #norm_ph_identity = [0.99 if x==1 else 0.01 for x in ph_identity]
                        norm_ph_identity = [int(x) for x in ph_identity]
                        norm_ph_identity_vec = ' '.join(map(str, norm_ph_identity[:]))
                        identity_vec_list.append(norm_ph_identity_vec)
                
                
                ### syllable level features ###
                if(decomposition_unit=='syllable' and syl_end):
                    #print syl
                    
                    ### duration features for syllable ###
                    dur_list.append(syl_num_of_frames)
                    
                    ### syllable and above level binary features ###
                    if feat_switch['binary']:
                        syl_feat = []
                        for x in range(len(mid_frame_feat)):
                            if(x < 348 or (x >= 405 and x < 421)):
                                continue;
                            syl_feat.append(mid_frame_feat[x])
                        norm_syl_feat = [0.99 if x==1 else 0.01 for x in syl_feat]
                        norm_syl_vec = ' '.join(map(str, norm_syl_feat[:]))
                        binary_feat_list.append(norm_syl_vec)
                    
                    if feat_switch['wordEmbed']:
                        word_embed_list.append(word_vec)
                    
                    ### syllable-identity features ###
                    if feat_switch['identity']:
                        ### to make nucleus centre ###
                        #if(vow_index<=1):
                        #    syl_identity = np.roll(syl_identity, 50*(vow_index+1)) 
                        norm_syl_identity = [0.99 if x==1 else 0.01 for x in syl_identity]
                        norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity[:]))
                        identity_vec_list.append(norm_syl_identity_vec)
                        
                    ### reset syllable information ###
                    phinsyl = 0; syl=''
                    syl_num_of_frames = 0 
                    syl_identity = self.zeros(300, 1)    
                
                j+=1                   
            
            ### default vectors to use ###
            if feat_switch['identity'] and decomposition_unit=='syllable': 
                syl_identity = self.zeros(300, 1)
                norm_syl_identity = [0.99 if x==1 else 0.01 for x in syl_identity]
                norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity[:]))
            if feat_switch['wordEmbed']:
                word_vec = self.wrd_embeds['*UNKNOWN*']
                
            
            ### writing features to output file ###
            op1 = open(out_filename, 'w')
            num_of_vectors = max(len(binary_feat_list), len(identity_vec_list), len(word_embed_list))
            for x in range(num_of_vectors):
                ### initialise feat vector ###
                feat_vec = ''
                
                ### binary features ###
                if feat_switch['binary']:
                    feat_vec = feat_vec + binary_feat_list[x]+' '
                    
                ### word embeddings ###
                if feat_switch['wordEmbed']:
                    if feat_switch['wordEmbed']>=3:
                        if(x-1<0):
                            feat_vec = feat_vec + word_vec+' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x-1]+' '
                    feat_vec = feat_vec + word_embed_list[x]+' '
                    if feat_switch['wordEmbed']>=3:
                        if(x+1>=len(binary_feat_list)):
                            feat_vec = feat_vec + word_vec+' '
                        else:
                            feat_vec = feat_vec + word_embed_list[x+1]+' '
                
                ### identity features ###
                if feat_switch['identity']:
                    if feat_switch['identity']>=5:
                        if(x-2<0):
                            feat_vec = feat_vec + norm_syl_identity_vec+' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x-2]+' '
                    if feat_switch['identity']>=3:
                        if(x-1<0):
                            feat_vec = feat_vec + norm_syl_identity_vec+' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x-1]+' '
                    feat_vec = feat_vec + identity_vec_list[x]+' '
                    if feat_switch['identity']>=3:
                        if(x+1>=len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec+' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x+1]+' '
                    if feat_switch['identity']>=5:
                        if(x+2>=len(binary_feat_list)):
                            feat_vec = feat_vec + norm_syl_identity_vec+' '
                        else:
                            feat_vec = feat_vec + identity_vec_list[x+2]+' '
                op1.write(feat_vec+'\n')
                #for z in range(dur_list[x]):
                #    op1.write(feat_vec + ' '.join(map(str, dur_feat_list[frame_indx+z,:]))+'\n')
                #frame_indx+=dur_list[x]
            op1.close()    
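    # Note on the feature scaling used in extract_base_features() above (added
    # commentary, not original code): binary question answers are mapped from
    # {0, 1} to {0.01, 0.99}, and the frame-level vectors are min-max
    # normalised into the same range with
    #     norm = (x - min) / (max - min) * 0.98 + 0.01
    # where the per-dimension min/max statistics are read from
    # label_norm_float_HTS.dat (first half of the file is the min vector, the
    # second half the max vector).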
    def load_labels_with_phone_alignment(self, file_name, dur_file_name):

        # this is not currently used ??? -- it works now :D
        logger = logging.getLogger("labels")
        #logger.critical('unused function ???')
        #raise Exception

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1 ## hard coded for now
            manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        if self.add_frame_features:
            assert self.dimension == self.dict_size+self.frame_feature_size
        elif self.subphone_feats != 'none':
            assert self.dimension == self.dict_size+self.frame_feature_size
        else:
            assert self.dimension == self.dict_size

        label_feature_matrix = numpy.empty((100000, self.dimension))

        ph_count=0
        label_feature_index = 0
        with open(file_name) as fid:
            all_data = fid.readlines()
        for line in all_data:
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            
            if len(temp_list)==1:
                frame_number = 0
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

                # to do - support different frame shift - currently hardwired to 5msec
                # currently under beta testing: support different frame shift
                if dur_file_name:
                    frame_number = manual_dur_data[ph_count]
                else:
                    frame_number = int(end_time/50000) - int(start_time/50000)

                if self.subphone_feats == "coarse_coding":
                    cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

            ph_count = ph_count+1
            #label_binary_vector = self.pattern_matching(full_label)
            label_binary_vector = self.pattern_matching_binary(full_label)

            # if there are no CQS questions, label_continuous_vector will be empty
            label_continuous_vector = self.pattern_matching_continous_position(full_label)
            label_vector = numpy.concatenate([label_binary_vector, label_continuous_vector], axis = 1)

            if self.add_frame_features:
                current_block_binary_array = numpy.zeros((frame_number, self.dict_size+self.frame_feature_size))
                for i in range(frame_number):
                    current_block_binary_array[i, 0:self.dict_size] = label_vector

                    if self.subphone_feats == 'minimal_phoneme':
                        ## features which distinguish frame position in phoneme
                        current_block_binary_array[i, self.dict_size] = float(i+1)/float(frame_number) # fraction through phone forwards
                        current_block_binary_array[i, self.dict_size+1] = float(frame_number - i)/float(frame_number) # fraction through phone backwards
                        current_block_binary_array[i, self.dict_size+2] = float(frame_number) # phone duration

                    elif self.subphone_feats == 'coarse_coding':
                        ## features which distinguish frame position in phoneme using three continuous numerical features
                        current_block_binary_array[i, self.dict_size+0] = cc_feat_matrix[i, 0]
                        current_block_binary_array[i, self.dict_size+1] = cc_feat_matrix[i, 1]
                        current_block_binary_array[i, self.dict_size+2] = cc_feat_matrix[i, 2]
                        current_block_binary_array[i, self.dict_size+3] = float(frame_number)

                    elif self.subphone_feats == 'none':
                        pass

                    else:
                        sys.exit('unknown subphone_feats type')

                label_feature_matrix[label_feature_index:label_feature_index+frame_number,] = current_block_binary_array
                label_feature_index = label_feature_index + frame_number

            elif self.subphone_feats == 'none':
                current_block_binary_array = label_vector
                label_feature_matrix[label_feature_index:label_feature_index+1,] = current_block_binary_array
                label_feature_index = label_feature_index + 1

        label_feature_matrix = label_feature_matrix[0:label_feature_index,]

        logger.info('loaded %s, %3d labels' % (file_name, ph_count) )
        logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape )
        return  label_feature_matrix
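# A minimal sketch (not part of the original code) of the time-to-frame
# conversion used above: HTK/HTS label times are expressed in 100 ns units,
# so with the hard-wired 5 ms frame shift one frame spans 50000 units. The
# helper and the example times below are illustrative only.
def frames_from_hts_times(start_time, end_time, units_per_frame=50000):
    """Number of 5 ms frames covered by an HTS label segment."""
    return int(end_time / units_per_frame) - int(start_time / units_per_frame)

# e.g. a segment from 0.05 s to 0.30 s covers 50 frames
print(frames_from_hts_times(500000, 3000000))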
Exemple #31
0

#  quick and dirty utility to print out binary files, for debugging

import sys
# import numpy
from io_funcs.binary_io import BinaryIOCollection

if __name__ == '__main__':

    ## shall we read the logging config file from command line?
    if len(sys.argv) < 3:
        print 'usage: python view.py dimension filename(s)'
        sys.exit(1)

    dimension = int(sys.argv[1])
    fnames = sys.argv[2:]

    print fnames

    io_funcs = BinaryIOCollection()
    for f in fnames:
        features = io_funcs.load_binary_file(f, dimension)

    print features.shape
    # print features
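# For reference, a rough numpy-only equivalent of the viewer above. It assumes
# (as BinaryIOCollection appears to) that each file holds raw float32 values
# stored frame by frame with a fixed dimension; the helper below is an
# illustration, not part of the original tool.
import numpy

def view_binary(filename, dimension):
    data = numpy.fromfile(filename, dtype=numpy.float32)
    assert data.size % dimension == 0, 'file size is not a multiple of the dimension'
    return data.reshape(-1, dimension)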
    def load_labels_with_phone_alignment(self, file_name, dur_file_name):

        # this is not currently used ??? -- it works now :D
        logger = logging.getLogger("labels")
        #logger.critical('unused function ???')
        #raise Exception

        if dur_file_name:
            io_funcs = BinaryIOCollection()
            dur_dim = 1  ## hard coded for now
            manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

        if self.add_frame_features:
            assert self.dimension == self.dict_size + self.frame_feature_size
        elif self.subphone_feats != 'none':
            assert self.dimension == self.dict_size + self.frame_feature_size
        else:
            assert self.dimension == self.dict_size

        label_feature_matrix = numpy.empty((100000, self.dimension))

        ph_count = 0
        label_feature_index = 0
        fid = open(file_name)
        for line in fid.readlines():
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)

            if len(temp_list) == 1:
                frame_number = 0
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

                # to do - support different frame shift - currently hardwired to 5msec
                # currently under beta testing: support different frame shift
                if dur_file_name:
                    frame_number = manual_dur_data[ph_count]
                else:
                    frame_number = int((end_time - start_time) / 50000)

                if self.subphone_feats == "coarse_coding":
                    cc_feat_matrix = self.extract_coarse_coding_features_relative(
                        frame_number)

            ph_count = ph_count + 1
            #label_binary_vector = self.pattern_matching(full_label)
            label_binary_vector = self.pattern_matching_binary(full_label)

            # if there are no CQS questions, label_continuous_vector will be empty
            label_continuous_vector = self.pattern_matching_continous_position(
                full_label)
            label_vector = numpy.concatenate(
                [label_binary_vector, label_continuous_vector], axis=1)

            if self.add_frame_features:
                current_block_binary_array = numpy.zeros(
                    (frame_number, self.dict_size + self.frame_feature_size))
                for i in range(frame_number):
                    current_block_binary_array[i,
                                               0:self.dict_size] = label_vector

                    if self.subphone_feats == 'minimal_phoneme':
                        ## features which distinguish frame position in phoneme
                        current_block_binary_array[
                            i, self.dict_size] = float(i + 1) / float(
                                frame_number
                            )  # fraction through phone forwards
                        current_block_binary_array[
                            i, self.dict_size +
                            1] = float(frame_number - i) / float(
                                frame_number
                            )  # fraction through phone backwards
                        current_block_binary_array[
                            i, self.dict_size + 2] = float(
                                frame_number)  # phone duration

                    elif self.subphone_feats == 'coarse_coding':
                        ## features which distinguish frame position in phoneme using three continuous numerical features
                        current_block_binary_array[i, self.dict_size +
                                                   0] = cc_feat_matrix[i, 0]
                        current_block_binary_array[i, self.dict_size +
                                                   1] = cc_feat_matrix[i, 1]
                        current_block_binary_array[i, self.dict_size +
                                                   2] = cc_feat_matrix[i, 2]
                        current_block_binary_array[i, self.dict_size +
                                                   3] = float(frame_number)

                    elif self.subphone_feats == 'none':
                        pass

                    else:
                        sys.exit('unknown subphone_feats type')

                label_feature_matrix[
                    label_feature_index:label_feature_index +
                    frame_number, ] = current_block_binary_array
                label_feature_index = label_feature_index + frame_number

            elif self.subphone_feats == 'none':
                current_block_binary_array = label_vector
                label_feature_matrix[label_feature_index:label_feature_index +
                                     1, ] = current_block_binary_array
                label_feature_index = label_feature_index + 1

        fid.close()

        label_feature_matrix = label_feature_matrix[0:label_feature_index, ]

        logger.info('loaded %s, %3d labels' % (file_name, ph_count))
        logger.debug('made label matrix of %d frames x %d labels' %
                     label_feature_matrix.shape)
        return label_feature_matrix
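# A small standalone illustration (not part of the original class) of the
# 'minimal_phoneme' sub-phone features built above for a single phone:
# fraction through the phone forwards, fraction backwards, and the phone
# duration in frames.
import numpy

def minimal_phoneme_features(frame_number):
    feats = numpy.zeros((frame_number, 3))
    for i in range(frame_number):
        feats[i, 0] = float(i + 1) / float(frame_number)              # forwards
        feats[i, 1] = float(frame_number - i) / float(frame_number)   # backwards
        feats[i, 2] = float(frame_number)                             # duration
    return feats

# minimal_phoneme_features(4) gives rows from [0.25, 1.0, 4.0] to [1.0, 0.25, 4.0]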
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the label data
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
        percent_to_keep: optional integer percentage of silent frames to retain (0 = remove all silence).
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]
        #         print silence_flag
        if not (numpy.unique(silence_flag) == numpy.array([0, 1])).all():
            ## if it's all 0s or 1s, that's ok:
            assert (numpy.unique(silence_flag) == numpy.array([0])).all() or \
                   (numpy.unique(silence_flag) == numpy.array([1])).all(), \
                'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)
        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100 * numpy.sum(silence_flag / float(len(silence_flag))), int(numpy.sum(silence_flag))))
        non_silence_indices = numpy.nonzero(
            silence_flag == 0)  ## indices where silence_flag == 0, i.e. the non-silent frames
        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            # print silence_flag
            silence_indices = numpy.nonzero(silence_flag == 1)
            ## nonzero returns a tuple of arrays, one for each dimension of input array
            silence_indices = silence_indices[0]
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]  ## every_nth used as integer step in the slice
            ## -1 due to weird error with STRAIGHT features at line 144:
            ## IndexError: index 445 is out of bounds for axis 0 with size 445
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print('   Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = (numpy.hstack([non_silence_indices[0], silence_indices_to_keep]))
            ##  ^---- from tuple and back (see nonzero note above)

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
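# A hedged usage sketch of trim_silence(); the file names, dimensions and the
# silence feature index below are placeholders, not values from the original
# recipe.
if __name__ == '__main__':
    acoustic_files = ['data/cmp/utt1.cmp', 'data/cmp/utt2.cmp']
    trimmed_files = ['data/cmp_trimmed/utt1.cmp', 'data/cmp_trimmed/utt2.cmp']
    label_files = ['data/lab/utt1.lab', 'data/lab/utt2.lab']
    trim_silence(acoustic_files, trimmed_files, in_dimension=187,
                 label_list=label_files, label_dimension=425,
                 silence_feature_index=424,  # label dimension that is 1 for silence
                 percent_to_keep=5)          # keep roughly every 20th silent frame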
class provider(object):
    def __init__(self, list_path, dim_lab, dim_cmp, root_lab, root_cmp,
                 train_num, batch_size, mode, index):

        f_list = open(list_path, 'r')
        lines_list = f_list.readlines()
        f_list.close()

        self.dim_lab = dim_lab
        self.dim_cmp = dim_cmp
        self.list_labels = [
            root_lab + item.split()[0] + '.lab' for item in lines_list
        ]
        self.list_cmp = [
            root_cmp + item.split()[0] + '.cmp' for item in lines_list
        ]

        for i in range(0, len(self.list_labels)):
            assert self.list_labels[i].split('.')[0].split(
                '/')[-1] == self.list_cmp[i].split('.')[0].split('/')[-1]

        self.list_index = 0
        self.end_reading = False
        self.io_tool = BinaryIOCollection()
        self.batch_size = batch_size

        if mode == 'train':
            self.list_labels_using = self.list_labels[:train_num]
            self.list_cmp_using = self.list_cmp[:train_num]
            self.len_list = len(self.list_labels_using)
        if mode == 'valid':
            self.list_labels_using = self.list_labels[train_num:]
            self.list_cmp_using = self.list_cmp[train_num:]
            self.len_list = len(self.list_labels_using)

        if index == 1:
            self.index_array = np.asarray([1, 0, 0])
            self.index_array = np.tile(self.index_array.reshape(1, -1),
                                       (self.batch_size, 1)).astype(np.float32)
        elif index == 2:
            self.index_array = np.asarray([0, 1, 0])
            self.index_array = np.tile(self.index_array.reshape(1, -1),
                                       (self.batch_size, 1)).astype(np.float32)
        elif index == 3:
            self.index_array = np.asarray([0, 0, 1])
            self.index_array = np.tile(self.index_array.reshape(1, -1),
                                       (self.batch_size, 1)).astype(np.float32)

        else:
            raise Exception('index out of range (expected 1, 2 or 3)')

    def reset(self):

        self.list_index = 0
        self.end_reading = False
        c = list(zip(self.list_labels_using, self.list_cmp_using))
        random.shuffle(c)
        self.list_labels_using, self.list_cmp_using = zip(*c)
        self.list_labels_using = list(self.list_labels_using)
        self.list_cmp_using = list(self.list_cmp_using)

    def load_one_batch(self):

        list_input = []
        list_target = []

        for i in range(0, self.batch_size):

            #print self.list_labels_using[self.list_index]
            labs = self.io_tool.load_binary_file(
                self.list_labels_using[self.list_index], self.dim_lab)
            cmps = self.io_tool.load_binary_file(
                self.list_cmp_using[self.list_index], self.dim_cmp)

            assert labs.shape[0] == cmps.shape[0]

            list_input.append(labs.astype(np.float32))
            list_target.append(cmps.astype(np.float32))

            self.list_index += 1

            if self.list_index + self.batch_size - 1 >= self.len_list:

                self.end_reading = True

        return list_input, list_target, self.index_array
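# A hedged usage sketch of the provider class above; the list file, feature
# dimensions, path roots and counts are placeholders.
if __name__ == '__main__':
    train_provider = provider(list_path='file_id_list.scp',
                              dim_lab=425, dim_cmp=187,
                              root_lab='data/nn_lab/', root_cmp='data/nn_cmp/',
                              train_num=1000, batch_size=8,
                              mode='train', index=1)
    num_batches = 0
    while not train_provider.end_reading:
        batch_lab, batch_cmp, index_onehot = train_provider.load_one_batch()
        num_batches += 1
    train_provider.reset()  # reshuffle the file lists for the next pass
    print('loaded %d batches' % num_batches)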