def keep_batch(self, batch_x: np.array, batch_y: np.array, preRight: np.array):
    """Refresh a training batch in place, keeping some of the failed samples.

    For every row of the batch:

    * a row whose ``preRight`` entry equals ``1.0`` is always replaced by a
      freshly drawn captcha;
    * any other row is kept untouched with probability ``keepRate`` (the mean
      of ``preRight``) and replaced otherwise.

    A replacement whose label is longer than ``self.labelLen`` is discarded
    and the row keeps its previous content; shorter labels are right-padded
    with spaces.

    Args:
        batch_x: batch inputs, overwritten row by row.
        batch_y: batch targets, overwritten row by row.
        preRight: per-row flags of the last prediction (``1.0`` is treated
            as "replace unconditionally").

    Returns:
        ``(batch_x, batch_y)`` (the same, mutated objects), or
        ``(None, None)`` once ``keepRate`` has reached ``self.accToStop``
        on at least ``self.stepToSaver`` calls.
    """
    total = len(preRight)
    self.checkNewTrainPath(total)

    keepRate = np.sum(preRight) / total
    count = 0

    for row, flag in enumerate(preRight):
        # random.random() is only consulted for rows not flagged 1.0
        if flag != 1.0 and random.random() < keepRate:
            count += 1
            continue

        text, picture = self.TrainPath.nextCaptcha(
            rotate=self.rotate, size=(self.width, self.height)
        )

        missing = self.labelLen - len(text)
        if missing < 0:
            # label too long for the model: leave this row as it was
            continue
        if missing > 0:
            text += ' ' * missing

        batch_x[row, :] = self.getBatchX(self.img2gray(picture))
        batch_y[row, :] = self.getBatchY(text)

    # Message reads: "image accuracy: <rate> - kept: <count>/<batch size>"
    print(
        f">>> 图片准确率: {keepRate: <.3F} - 保留率为: {count}/{preRight.__len__()}"
    )

    if not keepRate >= self.accToStop:
        return batch_x, batch_y

    self.__accRightCount += 1
    if self.__accRightCount >= self.stepToSaver:
        return None, None
    return batch_x, batch_y
def ph_based_trim(
    config,
    utt_id: str,
    text_ids: np.array,
    raw_text: str,
    audio: np.array,
    hop_size: int,
    sil_ph=("SIL", "END"),
) -> (bool, np.array, np.array):
    """Trim a leading and/or trailing silence phoneme from an utterance.

    When the first and/or last space-separated token of ``raw_text`` is one of
    ``sil_ph``, the matching entry is removed from ``text_ids`` and from the
    stored durations, and ``duration * hop_size`` samples are cut from the
    corresponding end of ``audio``. The shortened durations are written to
    ``<duration_fixed_path>/<utt_id>-durations.npy``.

    Args:
        config: Parsed yaml config. Must contain ``"rootdir"``; may contain
            ``"duration_path"`` (default ``<rootdir>/durations``) and
            ``"duration_fixed_path"`` (default ``<rootdir>/trimmed-durations``).
        utt_id: file name
        text_ids: array with text ids
        raw_text: raw text of file
        audio: parsed wav file
        hop_size: Hop size
        sil_ph: tokens treated as silence at either end of ``raw_text``.

    Returns:
        (bool, np.array, np.array) => if trimmed return True, new text_ids, new audio_array;
        otherwise False and the inputs unchanged.
    """
    default_fixed_path = os.path.join(config["rootdir"], "trimmed-durations")
    duration_path = config.get(
        "duration_path", os.path.join(config["rootdir"], "durations")
    )
    duration_fixed_path = config.get("duration_fixed_path", default_fixed_path)

    # The default directory is still always created (previous behaviour);
    # the directory that is actually written to must exist before np.save.
    for directory in (default_fixed_path, duration_fixed_path):
        os.makedirs(directory, exist_ok=True)

    text = raw_text.split(" ")
    trim_start = text[0] in sil_ph
    trim_end = text[-1] in sil_ph

    if not (trim_start or trim_end):
        return False, text_ids, audio

    # Computed before text_ids is shortened; the same bounds are reused for
    # the durations array below.
    idx_start = 1 if trim_start else 0
    idx_end = -1 if trim_end else len(text_ids)
    text_ids = text_ids[idx_start:idx_end]

    durations = np.load(os.path.join(duration_path, f"{utt_id}-durations.npy"))

    if trim_start:
        s_trim = int(durations[0] * hop_size)
        audio = audio[s_trim:]
    if trim_end:
        e_trim = int(durations[-1] * hop_size)
        # audio[:-e_trim] would return an empty array for e_trim == 0,
        # so compute the end position explicitly.
        audio = audio[: max(len(audio) - e_trim, 0)]

    durations = durations[idx_start:idx_end]
    np.save(os.path.join(duration_fixed_path, f"{utt_id}-durations.npy"), durations)
    return True, text_ids, audio
def train_test_split(x: np.array, y: np.array = None, train_size: float = 0.75, random_state: int = 42, shuffle: bool = True) -> tuple:
    '''
    Split arrays or matrices into random train and test subsets
    Parameters:
        - x: x array to split [numpy.array]
        - y: y array to split (like x); None or an empty sequence means "no y" [numpy.array, default = None]
        - train_size: fraction of the samples that goes into the train set [Float, default = 0.75]
        - random_state: random state for shuffling [Integer, default = 42]
        - shuffle: whether (=True, default) to shuffle or not (=False) [Boolean]
    Returns:
        - tuple containing [Tuple]
            - X_train: x train array [numpy.array]
            - X_test: x test array [numpy.array]
            - y_train: y train array, only present when y is given [numpy.array]
            - y_test: y test array, only present when y is given [numpy.array]
    Raises:
        - AssertionError: when y is given and its length differs from x
    Note:
        - the global numpy random generator is (re)seeded with random_state on every call
    '''
    ## set random seed (global numpy state, kept for reproducible callers)
    np.random.seed(random_state)
    n_samples = len(x)
    ## an absent y is signalled by None or by an empty sequence
    has_y = y is not None and len(y) > 0
    ## check length of both arrays - raised explicitly so the check survives `python -O`
    if has_y and len(y) != n_samples:
        raise AssertionError("x and y data streams must have same length")
    ## get the index where dataset gets split
    idx = int(np.ceil(train_size * n_samples))
    indices = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)
    train = indices[:idx]
    test = indices[idx:]
    X_train = x[train]
    X_test = x[test]
    ## when y array given
    if has_y:
        return (X_train, X_test, y[train], y[test])
    return (X_train, X_test)
def get_dendrogram_data(self, data:np.array, clusters:np.array, method:str = "single-linkage") -> np.array:
    '''
    calculates the data that is needed by the scipy.cluster.hierarchy.dendrogram function to plot the desired dendrogram
    Parameters:
        - data: data points for the dendrogram (usually the cluster centers) [numpy.array]
        - clusters: labels of the respective clusters [numpy.array]
        - method: desired linkage type, handed on to self.recalc_matrix. Possible values are [String]
            - "single-linkage" (default)
            - "full-linkage"
            - "average"
    Returns:
        - dd_data: data of dendrogram, one row per merge with the columns (child1, child2, distance, count) [numpy.array]
    Note:
        - when several pairs share the smallest distance, the first one in row-major order is merged
    '''
    ## make sure 'cluster names' are numeric and unique and a numpy.array (0 .. k-1)
    clusters = np.arange(len(np.unique(clusters)))
    ## init the distance matrix with infinity, so the diagonal is never picked as the minimum
    ## (np.inf instead of np.infty: the alias was removed in NumPy 2.0)
    n_points = len(data)
    matrix = np.full((n_points, n_points), np.inf)
    ## the matrix is symmetric - calc every pair once and mirror it
    for i, point in enumerate(data):
        for j, other in enumerate(data[i + 1:], start=i + 1):
            matrix[i, j] = matrix[j, i] = np.linalg.norm(point - other)
    ## get the biggest 'cluster name'
    max_label = np.max(clusters)
    ## init children and distances
    children = []
    distances = []
    ## as linkage matrix contains len(clusters)-1 rows and 4 columns (child1, child2, distance, count)
    for i in range(len(clusters) - 1):
        ## get the (sorted) indices of one pair of clusters with the lowest distance in between
        closest_pair = np.unravel_index(np.argmin(matrix), matrix.shape)
        matched_clusters = np.sort(np.array(closest_pair))
        ## recalculate the matrix
        matrix, new_distance = self.recalc_matrix(matrix, matched_clusters, method)
        ## check 'who' the new children are - sorted
        children.append(sorted(clusters[matched_clusters]))
        ## replace the 'cluster names' of the already merged ones with new ones (just counting up)
        clusters[matched_clusters] = max_label + i + 1
        ## add the current distance to the distances
        distances.append(new_distance)
    ## make sure children and distances are numpy.arrays
    children = np.array(children)
    distances = np.array(distances)
    ## get the counts
    counts = self.get_counts(children, data)
    ## set together the whole dendrogram data (being the scipy.cluster.hierarchy.linkage)
    ddata = np.column_stack([children, distances, counts]).astype(float)
    return ddata