def parse_librispeech_tester(strRootDir: str,
                             strFileType: str = '.flac',
                             nCountSample: int = 1000,
                             nSamplingRate: int = 16000,
                             nFFTCount: int = 512,
                             nMelOrder: int = 24,
                             nMFCCOrder: int = 13,
                             nContextSize: int = 10,
                             dWindowLength: float = 0.025,
                             dShiftLength: float = 0.01,
                             strFeatureType: str = "mfcc",
                             strMode: str = "dvector"  # dvector, gmm, ctc, las
                             ):
    pDicFile = collections.defaultdict(list)
    pListTestFile = []
    # Extract file names, grouped by speaker ID
    for strRoot, strDir, pListFileName in tqdm(os.walk(strRootDir)):
        iCount = 0
        for strFileName in pListFileName:
            if splitext(strFileName)[1] == strFileType:
                strID = splitext(strFileName)[0].split('-')[0]
                pDicFile[strID].append(os.path.join(strRoot, strFileName))
                iCount += 1
                if iCount > nCountSample:
                    break
    # List the test dataset (every collected file is used for testing)
    for i, pListFileName in pDicFile.items():
        pListTestFile.extend(pListFileName)
    # Label speakers for PyTorch training
    pDicLabel = {}
    pListSpeakers = list(pDicFile.keys())
    nSpeakersCount = len(pListSpeakers)
    for i in range(nSpeakersCount):
        pDicLabel[pListSpeakers[i]] = i
    # Transform the data dictionary into a dataset
    pDataTest = YoonDataset()
    for strFileName in pListTestFile:
        strID = splitext(basename(strFileName))[0].split('-')[0]
        pSpeech = YoonSpeech(strFileName=strFileName, nSamplingRate=nSamplingRate,
                             strFeatureType=strFeatureType, nContextSize=nContextSize,
                             nFFTCount=nFFTCount, nMelOrder=nMelOrder, nMFCCOrder=nMFCCOrder,
                             dWindowLength=dWindowLength, dShiftLength=dShiftLength)
        pObject = YoonObject(nID=int(pDicLabel[strID]), strName=strID, strType=strFeatureType,
                             pSpeech=pSpeech)
        pDataTest.append(pObject)
    if strMode == "dvector" or strMode == "gmm":
        nDimOutput = nSpeakersCount
    elif strMode == "ctc" or strMode == "las":
        nDimOutput = yoonspeech.DEFAULT_PHONEME_COUNT
    else:
        raise ValueError("Unsupported parsing mode")
    return nDimOutput, pDataTest
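# Usage sketch (illustrative only, not part of the original source): the LibriSpeech root
# below is a placeholder path, and the returned output dimension follows strMode as
# implemented above (speaker count for "dvector"/"gmm", phoneme count for "ctc"/"las").
#
#   nDimOutput, pDataTest = parse_librispeech_tester("/data/LibriSpeech/test-clean",
#                                                    strFeatureType="mfcc",
#                                                    strMode="gmm")
#   print("Output dimension: {}, Test samples: {}".format(nDimOutput, len(pDataTest)))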
def train(pTrainData: YoonDataset, pTestData: YoonDataset, strModelPath: str):
    # Make the datasets
    pTrainSet = pTrainData.to_gmm_dataset()
    pTestSet = pTestData.to_gmm_dataset()
    # Shuffle the datasets
    numpy.random.shuffle(pTrainSet)
    numpy.random.shuffle(pTestSet)
    # GMM training
    nMixture = 4
    pDicGMM = {}
    for i in range(len(pTrainSet)):
        pDicGMM[pTrainSet[i][1]] = sklearn.mixture.GaussianMixture(n_components=nMixture,
                                                                   random_state=48,
                                                                   covariance_type='diag')
        # Use the "full" covariance type when the features include delta coefficients
    for i in tqdm(range(len(pTrainSet))):
        pDicGMM[pTrainSet[i][1]].fit(pTrainSet[i][0])
    # GMM test
    iAccuracy = 0
    for i, (pInputData, nTargetLabel) in enumerate(pTestSet):
        pDicCandidateScore = {}
        # Calculate likelihood scores for all the trained GMMs
        for jSpeaker in pDicGMM.keys():
            pDicCandidateScore[jSpeaker] = pDicGMM[jSpeaker].score(pInputData)
        nLabelEstimated = max(pDicCandidateScore.keys(), key=(lambda key: pDicCandidateScore[key]))
        print("Estimated: {0}, Score: {1:.2f}, True: {2}".format(nLabelEstimated,
                                                                 pDicCandidateScore[nLabelEstimated],
                                                                 nTargetLabel), end=' ')
        if nTargetLabel == nLabelEstimated:
            print("Correct!")
            iAccuracy += 1
        else:
            print("Incorrect...")
    print("Accuracy: {:.2f}".format(iAccuracy / len(pTestSet) * 100.0))
    # Save the GMM models
    with open(strModelPath, 'wb') as pFile:
        pickle.dump(pDicGMM, pFile)
    print("Saved {} GMM models".format(len(pDicGMM)))
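# Self-contained sketch (illustrative only) of the likelihood-based decision used above:
# one diagonal-covariance GaussianMixture per speaker, and a query utterance is assigned
# to the model with the highest average log-likelihood (GaussianMixture.score).
# The synthetic 13-dimensional "MFCC" features below are placeholders, not project data.
import numpy
import sklearn.mixture

pArrayFeatureA = numpy.random.RandomState(0).normal(loc=0.0, scale=1.0, size=(200, 13))
pArrayFeatureB = numpy.random.RandomState(1).normal(loc=3.0, scale=1.0, size=(200, 13))
pDicDemoGMM = {
    "speakerA": sklearn.mixture.GaussianMixture(n_components=4, covariance_type='diag',
                                                random_state=48).fit(pArrayFeatureA),
    "speakerB": sklearn.mixture.GaussianMixture(n_components=4, covariance_type='diag',
                                                random_state=48).fit(pArrayFeatureB),
}
pArrayQuery = numpy.random.RandomState(2).normal(loc=3.0, scale=1.0, size=(50, 13))
strEstimated = max(pDicDemoGMM.keys(), key=lambda strKey: pDicDemoGMM[strKey].score(pArrayQuery))
print("Estimated speaker:", strEstimated)  # expected: speakerB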
def recognition(pSpeech: YoonSpeech, strModelPath: str, nCountSpeakers: int):
    # Wrap the speech sample in a dataset
    pTestData = YoonDataset(None, pSpeech)
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        pDevice = torch.device('cuda')
    else:
        pDevice = torch.device('cpu')
    # Load the DVector model
    pModel = DVector(nDimInput=pTestData.get_dimension(),
                     nDimOutput=nCountSpeakers).to(device=pDevice)
    pModel.eval()
    pFile = torch.load(strModelPath)
    pModel.load_state_dict(pFile['model'])
    # Run the recognition model
    pTensorInput = torch.from_numpy(pTestData[0].buffer).to(pDevice).unsqueeze(0)
    pArrayOutput = pModel(pTensorInput, bExtract=True).detach().cpu().numpy()
    nLabelEstimated = numpy.argmax(pArrayOutput, -1)  # Index of the maximum output activation
    print("Estimated: {0}, Score: {1:.2f}".format(nLabelEstimated, numpy.max(pArrayOutput)))
    return nLabelEstimated
def test(pTestData: YoonDataset, strModelPath: str, nCountSpeakers: int):
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        pDevice = torch.device('cuda')
    else:
        pDevice = torch.device('cpu')
    print("{} device activation".format(pDevice.__str__()))
    # Load the DVector model
    pModel = DVector(nDimInput=pTestData.get_dimension(), nDimOutput=nCountSpeakers).to(pDevice)
    # Switch to evaluation mode and restore the trained weights
    pModel.eval()
    pFile = torch.load(strModelPath)
    pModel.load_state_dict(pFile['model'])
    print("Successfully loaded the model from {}".format(strModelPath))
    # Define the data loader used for the test plot
    pDataSet = DvectorDataset(pTestData)
    pDataLoader = DataLoader(pDataSet, batch_size=1, shuffle=False, collate_fn=collate_dvector,
                             num_workers=0, pin_memory=True)
    pBar = tqdm(pDataLoader)
    print("Length of data = ", len(pBar))
    pListOutput = []
    pListTarget = []
    for i, (pTensorInput, pTensorTarget) in enumerate(pBar):
        pTensorInput = pTensorInput.type(torch.FloatTensor).to(pDevice)
        pTensorOutput = pModel(pTensorInput, bExtract=False)
        pListOutput.append(pTensorOutput.detach().cpu().numpy())
        pListTarget.append(pTensorTarget.detach().cpu().numpy()[0])  # (Batch, Label) to (Label)
    # Prepare the embeddings for the plot
    pArrayOutput = numpy.concatenate(pListOutput)
    pArrayTarget = numpy.array(pListTarget)
    # Obtain the embedding for the t-SNE plot
    pTSNE = TSNE(n_components=2)
    pArrayOutput = pArrayOutput.reshape(len(pArrayOutput), -1)
    pArrayTSNE = pTSNE.fit_transform(pArrayOutput)
    # Draw the plot
    __draw_tSNE(pArrayTSNE, pArrayTarget)
def test(pTestData: YoonDataset, strModelPath: str):
    # Load the GMM models
    with open(strModelPath, 'rb') as pFile:
        pDicGMM = pickle.load(pFile)
    pTestSet = pTestData.to_gmm_dataset()
    # GMM test
    iAccuracy = 0
    for i, (pInputData, nTargetLabel) in enumerate(pTestSet):
        pDicCandidateScore = {}
        # Calculate likelihood scores for all the trained GMMs
        for jSpeaker in pDicGMM.keys():
            pDicCandidateScore[jSpeaker] = pDicGMM[jSpeaker].score(pInputData)
        nLabelEstimated = max(pDicCandidateScore.keys(), key=(lambda key: pDicCandidateScore[key]))
        print("Estimated: {0}, Score: {1:.2f}, True: {2}".format(nLabelEstimated,
                                                                 pDicCandidateScore[nLabelEstimated],
                                                                 nTargetLabel), end=' ')
        if nTargetLabel == nLabelEstimated:
            print("Correct!")
            iAccuracy += 1
        else:
            print("Incorrect...")
    print("Accuracy: {:.2f}".format(iAccuracy / len(pTestSet) * 100.0))
def train(nEpoch: int, strModelPath: str, pTrainData: YoonDataset, pValidationData: YoonDataset,
          nCountSpeakers, bInitEpoch=False):
    dLearningRate = 0.01
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        pDevice = torch.device('cuda')
    else:
        pDevice = torch.device('cpu')
    print("{} device activation".format(pDevice.__str__()))
    # Define the training and validation data loaders
    pTrainSet = DvectorDataset(pTrainData)
    pTrainLoader = DataLoader(pTrainSet, batch_size=8, shuffle=True, collate_fn=collate_dvector,
                              num_workers=0, pin_memory=True)
    pValidationSet = DvectorDataset(pValidationData)
    pValidationLoader = DataLoader(pValidationSet, batch_size=1, shuffle=False,
                                   collate_fn=collate_dvector, num_workers=0, pin_memory=True)
    # Define a network model
    pModel = DVector(nDimInput=pTrainData.get_dimension(), nDimOutput=nCountSpeakers).to(pDevice)
    # Set the optimizer to Adam
    pOptimizer = torch.optim.Adam(pModel.parameters(), lr=dLearningRate)
    # Set the training criterion
    pCriterion = torch.nn.CrossEntropyLoss(reduction='mean')
    # Load the pre-trained model if it exists
    nStart = 0
    print("Directory of the pre-trained model: {}".format(strModelPath))
    if strModelPath is not None and os.path.exists(strModelPath) and bInitEpoch is False:
        pModelData = torch.load(strModelPath)
        nStart = pModelData['epoch']
        pModel.load_state_dict(pModelData['model'])
        pOptimizer.load_state_dict(pModelData['optimizer'])
        print("## Successfully loaded the model at epoch {}!".format(nStart))
    # Train and validate repeatedly
    dMinLoss = 10000.0
    nCountDecrease = 0
    for iEpoch in range(nStart, nEpoch + 1):
        # Train the network
        __process_train(iEpoch, pModel=pModel, pDataLoader=pTrainLoader, pCriterion=pCriterion,
                        pOptimizer=pOptimizer)
        # Evaluate the network
        dLoss = __process_evaluate(pModel=pModel, pDataLoader=pValidationLoader, pCriterion=pCriterion)
        # Save the model whenever the validation loss improves
        if dLoss < dMinLoss:
            dMinLoss = dLoss
            torch.save({'epoch': iEpoch,
                        'model': pModel.state_dict(),
                        'optimizer': pOptimizer.state_dict()}, strModelPath)
            nCountDecrease = 0
        else:
            nCountDecrease += 1
            # Halve the learning rate when the validation loss fails to improve 3 times in a row
            if nCountDecrease == 3:
                pDicOptimizerState = pOptimizer.state_dict()
                pDicOptimizerState['param_groups'][0]['lr'] /= 2
                pOptimizer.load_state_dict(pDicOptimizerState)
                print('learning rate is divided by 2')
                nCountDecrease = 0
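# Alternative sketch (not the project's code): the manual halving above can also be expressed
# with PyTorch's built-in ReduceLROnPlateau scheduler. The tiny model, optimizer, and constant
# validation loss below are placeholders for illustration only.
import torch

pDemoModel = torch.nn.Linear(10, 2)                      # placeholder for DVector
pDemoOptimizer = torch.optim.Adam(pDemoModel.parameters(), lr=0.01)
pScheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(pDemoOptimizer, mode='min',
                                                        factor=0.5, patience=3)
for iEpoch in range(10):
    dValidationLoss = 1.0                                # placeholder for __process_evaluate(...)
    pScheduler.step(dValidationLoss)                     # halves the LR once the loss stops improving past the patience window
print("Final learning rate:", pDemoOptimizer.param_groups[0]['lr'])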
def parse_librispeech_trainer(root_dir: str,
                              file_type: str = '.flac',
                              sample_len: int = 1000,
                              sample_rate: int = 16000,
                              fft_count: int = 512,
                              mel_order: int = 24,
                              mfcc_order: int = 13,
                              context_size: int = 10,
                              win_len: float = 0.025,
                              shift_len: float = 0.01,
                              feature_type: str = "mfcc",
                              train_rate: float = 0.8,
                              mode: str = "dvector"  # dvector, gmm, ctc, las
                              ):
    def get_line_in_trans(file_path, id):
        # Find the transcription line that starts with the utterance ID and strip the ID
        with open(file_path) as pFile:
            lines = pFile.read().lower().split('\n')[:-1]
        for line in lines:
            if id in line:
                return line.replace(id + ' ', "")

    def make_speech_buffer(file_path):
        speech = YoonSpeech(sample_rate=sample_rate, context_size=context_size,
                            fft_count=fft_count, mel_order=mel_order, mfcc_order=mfcc_order,
                            win_len=win_len, shift_len=shift_len)
        speech.load_sound_file(file_path)
        return speech

    feature_file_dic = collections.defaultdict(list)
    trans_file_dic = collections.defaultdict(dict)
    trans_files = []
    test_files = []
    # Extract file names, grouped by speaker ID
    for root, dir_, file_paths in tqdm(os.walk(root_dir)):
        i = 0
        for path in file_paths:
            if splitext(path)[1] == file_type:
                id_ = splitext(path)[0].split('-')[0]
                feature_file_dic[id_].append(os.path.join(root, path))
                i += 1
                if i > sample_len:
                    break
            elif splitext(path)[1] == ".txt":  # Transcription files for the word labels
                id_, part = splitext(path)[0].split('-')
                part = part.replace(".trans", "")
                trans_file_dic[id_][part] = os.path.join(root, path)
    # List the train and test datasets
    for i, file_paths in feature_file_dic.items():
        trans_files.extend(file_paths[:int(len(file_paths) * train_rate)])
        test_files.extend(file_paths[int(len(file_paths) * train_rate):])
    # Label speakers for speaker recognition
    speaker_dic = {}
    speakers = list(feature_file_dic.keys())
    num_speakers = len(speakers)
    for i in range(num_speakers):
        speaker_dic[speakers[i]] = i
    # Transform the data dictionary into datasets
    train_dataset = YoonDataset()
    eval_dataset = YoonDataset()
    for path in trans_files:
        basename_ = splitext(basename(path))[0]
        id_, part = basename_.split('-')[0], basename_.split('-')[1]
        word = get_line_in_trans(trans_file_dic[id_][part], basename_)
        speech = make_speech_buffer(path)
        obj = YoonObject(id=int(speaker_dic[id_]), name=id_, word=word, type=feature_type,
                         speech=speech)
        train_dataset.append(obj)
    for path in test_files:
        basename_ = splitext(basename(path))[0]
        id_, part = basename_.split('-')[0], basename_.split('-')[1]
        word = get_line_in_trans(trans_file_dic[id_][part], basename_)
        speech = make_speech_buffer(path)
        obj = YoonObject(id=int(speaker_dic[id_]), name=id_, word=word, type=feature_type,
                         speech=speech)
        eval_dataset.append(obj)
    print("Length of Train = {}".format(len(train_dataset)))
    print("Length of Test = {}".format(len(eval_dataset)))
    if mode == "dvector" or mode == "gmm":
        output_dim = num_speakers
    elif mode == "ctc" or mode == "las":
        output_dim = yoonspeech.DEFAULT_PHONEME_COUNT
    else:
        raise ValueError("Unsupported parsing mode")
    return output_dim, train_dataset, eval_dataset
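# Usage sketch (illustrative only, not part of the original source): the LibriSpeech root
# below is a placeholder, and train_rate=0.8 splits each speaker's utterances 80/20 into
# the train and eval datasets, as implemented above.
#
#   output_dim, train_dataset, eval_dataset = parse_librispeech_trainer(
#       "/data/LibriSpeech/train-clean-100", feature_type="mfcc", train_rate=0.8, mode="ctc")
#   print("Output dimension: {}".format(output_dim))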
def train(nEpoch: int,
          pTrainData: YoonDataset,
          pValidationData: YoonDataset,
          strModelPath="model_ctc.pth",
          nSizeBatch=8,
          bInitTrain=False,
          dLearningRate=0.01,
          nWorker=0,  # 0 = CPU, 4 = CUDA
          ):
    # Set the device for running the CTC model
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        pDevice = torch.device('cuda')
    else:
        pDevice = torch.device('cpu')
    # Define the network architecture
    pModel = CTC(nDimInput=pTrainData.get_dimension(), nDimOutput=256,
                 nCountClass=yoonspeech.DEFAULT_PHONEME_COUNT)
    pModel = pModel.to(pDevice)
    # Define the optimizer
    pOptimizer = Adam(pModel.parameters(), lr=dLearningRate)
    # Define the training criterion
    pCriterion = torch.nn.CTCLoss(blank=0)
    # Load the pre-trained model when resuming the training
    nStart = 0
    if os.path.isfile(strModelPath) and bInitTrain is False:
        pModelData = torch.load(strModelPath, map_location=pDevice)
        pModel.load_state_dict(pModelData['encoder'])
        pOptimizer.load_state_dict(pModelData['optimizer'])
        nStart = pModelData['epoch']
        print("## Successfully loaded the CTC model : epoch {}".format(nStart))
    # Define the training and validation datasets
    pTrainDataset = ASRDataset(pTrainData)
    pTrainLoader = DataLoader(pTrainDataset, batch_size=nSizeBatch, collate_fn=collate_asrdata,
                              shuffle=True, num_workers=nWorker, pin_memory=True)
    pValidDataset = ASRDataset(pValidationData)
    pValidLoader = DataLoader(pValidDataset, batch_size=nSizeBatch, collate_fn=collate_asrdata,
                              shuffle=False, num_workers=nWorker, pin_memory=True)
    # Perform the training / validation processing
    dMinLoss = 10000.0
    nCountDecrease = 0
    for iEpoch in range(nStart, nEpoch + 1):
        __process_train(iEpoch, pModel, pTrainLoader, pCriterion, pOptimizer)
        # Evaluate on the validation loader (the original passed the training loader here)
        dValidLoss, dValidLER = __process_test(iEpoch, pModel, pValidLoader, pCriterion)
        # Save the model whenever the validation loss improves
        if dMinLoss > dValidLoss:
            dMinLoss = dValidLoss
            torch.save({'epoch': iEpoch,
                        'encoder': pModel.state_dict(),
                        'optimizer': pOptimizer.state_dict()}, strModelPath)
            nCountDecrease = 0
        else:
            nCountDecrease += 1
            # Halve the learning rate when the validation loss fails to improve 5 times in a row
            if nCountDecrease == 5:
                pDicOptimizerState = pOptimizer.state_dict()
                pDicOptimizerState['param_groups'][0]['lr'] /= 2
                pOptimizer.load_state_dict(pDicOptimizerState)
                print('learning rate is divided by 2')
                nCountDecrease = 0
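# Shape sketch (illustrative only) for torch.nn.CTCLoss(blank=0) as configured above: it expects
# log-probabilities of shape (T, N, C), integer targets of shape (N, S), and per-sample input /
# target lengths. The sizes below are placeholders; in this project C corresponds to
# yoonspeech.DEFAULT_PHONEME_COUNT.
import torch

pCriterionDemo = torch.nn.CTCLoss(blank=0)
nTime, nBatch, nClass, nTargetMax = 50, 8, 40, 30
pTensorLogProb = torch.randn(nTime, nBatch, nClass).log_softmax(dim=2)             # (T, N, C)
pTensorTarget = torch.randint(1, nClass, (nBatch, nTargetMax), dtype=torch.long)   # labels 1..C-1 (0 is the blank)
pTensorInputLength = torch.full((nBatch,), nTime, dtype=torch.long)
pTensorTargetLength = torch.randint(10, nTargetMax, (nBatch,), dtype=torch.long)
dLoss = pCriterionDemo(pTensorLogProb, pTensorTarget, pTensorInputLength, pTensorTargetLength)
print("CTC loss:", dLoss.item())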