def segmentClassification(data, model_name, model_type):
    if not os.path.isfile(model_name):
        print("segmentClassification: input model_name not found!")
        return (-1, -1, -1)
    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)

    # the input buffer is assumed to be sampled at 250 kHz; convert to mono
    [Fs, x] = 250000, audioBasicIO.stereo2mono(data)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization
    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
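# A minimal usage sketch for segmentClassification above. The model path
# "data/svmModel", the "svm" model type, and the random buffer are
# illustrative assumptions; any pre-trained pyAudioAnalysis model plus a raw
# numpy buffer sampled at 250 kHz (as hard-coded in the function) would do.
import numpy

buffer = numpy.random.randn(250000 * 3)  # 3 seconds of placeholder audio
Result, P, classNames = segmentClassification(buffer, "data/svmModel", "svm")
if not isinstance(classNames, int):  # -1 signals an error
    print(classNames[int(Result)], P)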
def process_files(recordings_dir, sub_dirs, file_ext='*.wav'):
    # 261 = 193 features from extract_acoustic_features + 68 from
    # pyAudioAnalysis
    features, labels = np.empty((0, 261)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(recordings_dir, sub_dir, file_ext)):
            # get features using pyAudioAnalysis
            [Fs, x] = audioBasicIO.readAudioFile(fn)  # open file
            # Mid-term feature extraction: mean and SD of the short-term
            # feature sequence (essentially an STFT, but we define the
            # window). mtFeatureExtraction(signal, Fs, mtWin, mtStep, stWin,
            # stStep) also extracts short-term features, but we do not use
            # them. 50 ms frame size with 25 ms frame step (50% overlap).
            [mtF, stF] = audioFeatureExtraction.mtFeatureExtraction(
                x, Fs, 1 * Fs, 1 * Fs, 0.050 * Fs, 0.025 * Fs)
            # get rid of one empty dimension, so we can np.hstack below
            mtF = np.reshape(mtF, -1)
            # get features using librosa via extract_acoustic_features
            mfcc, chroma, mel, contrast, tonnetz = extract_acoustic_features(fn)
            # we need all numbers in one vector, so np.hstack
            get_features = np.hstack([mfcc, chroma, mel, contrast, tonnetz,
                                      mtF])
            # print(mfcc, chroma, mel, contrast, tonnetz, mtF)
            # each file is one row, so np.vstack
            features = np.vstack([features, get_features])
            # class labels come from file names: 006_food_1_.wav gives 1
            # (split by '/', then by '_', take the third element)
            labels = np.append(labels, fn.split('/')[2].split('_')[2])
            print(fn)  # print file names as they get processed
    return np.array(features), np.array(labels, dtype=int)
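# A minimal sketch of calling process_files, assuming a layout like
# recordings/<class_dir>/006_food_1_.wav so that fn.split('/')[2] is the
# file name; the directory and class names are placeholders, not from the
# original project.
X, y = process_files('recordings', ['food', 'water'])
print(X.shape, y.shape)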
def classifyNN(inputFile, modelName):
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
     computeBEAT] = loadModel(modelName)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:  # shorter than one mid-term window
        return (-1, -1, -1)
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization
    [Result, P] = classify(Classifier, curFV)
    return Result, P, classNames
def POST(self):
    x = web.input(myfile={})
    # save the uploaded file to a unique temporary WAV path
    filename = 'tmp/' + uuid.uuid4().hex + '.wav'
    file = open(filename, 'w+')
    file.seek(0)
    file.write(x['myfile'].value)
    file.close()
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    #os.remove(filename)
    x = audioBasicIO.stereo2mono(x)
    [F, _] = audioFeatureExtraction.mtFeatureExtraction(
        x, Fs, round(Fs * 1.0), round(Fs * 1.0), round(Fs * 0.050),
        round(Fs * 0.050))
    F = F.transpose()
    # majority vote over the per-segment nearest-neighbour results
    # (the counters must be initialized outside the loop to accumulate)
    results = {}
    current_highest = ""
    current_highest_value = 0
    for vec in F:
        vec = numpy.around(vec.astype(numpy.float), 6)
        current = model.getNN(vec)
        result = current[0][1].partition("_")[0]
        if result in results:
            results[result] = results[result] + 1
        else:
            results[result] = 1
        if results[result] > current_highest_value:
            current_highest_value = results[result]
            current_highest = result
    print(results)
    print(current_highest)
    raise web.seeother('/')
def predict(files, file, modelName):
    # read audio file, convert to mono (if needed)
    [Fs, x] = audioBasicIO.readAudioFile(file)
    x = audioBasicIO.stereo2mono(x)
    if modelName:
        mtWin, mtStep, stWin, stStep = 1.0, 1.0, aT.shortTermWindow, \
            aT.shortTermStep
        Classifier, MEAN, STD = loadSVModel(modelName)
    else:
        with open(SVMmodelName, 'rb') as fid:
            Classifier = cPickle.load(fid)
            MEAN = cPickle.load(fid)
            STD = cPickle.load(fid)
        mtWin, mtStep, stWin, stStep = 1.0, 1.0, aT.shortTermWindow, \
            aT.shortTermStep
    # extract features from sample
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization
    # predict
    result = Classifier.predict(curFV.reshape(1, -1))[0]
    prob = Classifier.predict_proba(curFV.reshape(1, -1))[0]
    s = files[int(result)]
    return re.findall(r'\d+', s)[0] + " pills", prob
def fileRegression(inputFile, modelName, modelType):
    # Load regression models:
    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)
    regressionModels = glob.glob(modelName + "_*")
    regressionModels2 = []
    for r in regressionModels:
        if r[-5::] != "MEANS":
            regressionModels2.append(r)
    regressionModels = regressionModels2
    regressionNames = []
    for r in regressionModels:
        regressionNames.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm' or modelType == "svm_rbf":
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'randomforest':
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadRandomForestModel(regressionModels[0], True)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print("fileRegression: input modelName not found!")
            return (-1, -1, -1)
        if modelType == 'svm' or modelType == "svm_rbf":
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadSVModel(r, True)
        elif modelType == 'randomforest':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadRandomForestModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        R.append(regressionWrapper(Model, modelType, curFV))  # regression
    return R, regressionNames
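# A minimal sketch of calling fileRegression above. The model prefix
# "data/svmSpeechEmotion" (which the glob expands to per-dimension models
# such as ..._valence and ..._arousal) is an assumption borrowed from
# pyAudioAnalysis' bundled models; any prefix with matching regression
# models works.
R, names = fileRegression("sample.wav", "data/svmSpeechEmotion", "svm")
for name, value in zip(names, R):
    print(name, value)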
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)  # read audio data
    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    except:
        fo.close()
    fo.close()

    #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs,
    #                                                      0.050*Fs)
    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                           round(Fs * 0.050),
                                           round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)  # apply model
    #for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]] == "silence":
    #        flagsInd[i] = classesAll.index("speech")

    # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)
        flagsGTNew = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = numpy.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep,
                                  not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
def fileClassification(inputFile, model_name, model_type):
    # Load classifier:
    print("Loading Classifier")
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)
    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)
    print("Printing Classnames")
    print(classNames)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):  # audio file IO problem
        print('io problem')
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mt_win:  # shorter than one mid-term window
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization
    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
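# A minimal sketch of calling fileClassification. The WAV path and the
# "svmSMtemp" model name are placeholders; any audio file plus a matching
# pre-trained pyAudioAnalysis model would do.
Result, P, classNames = fileClassification("sample.wav", "svmSMtemp", "svm")
if not isinstance(classNames, int):  # -1 signals an error
    print("winner:", classNames[int(Result)], "probabilities:", P)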
def analyze(vocs):
    for voc in tqdm(vocs, desc="Vocalization Analysis: ", ascii=True):
        data, rate = voc['audio'], voc['framerate']
        mt_feats, _, _ = audioFeatureExtraction.mtFeatureExtraction(
            data, rate, MT_WIN, MT_STEP, ST_WIN, ST_STEP)
        # each row of mt_feats is the mid-term statistic of one short-term
        # feature; average it over all mid-term windows
        voc['spectral_entropy'] = np.mean(mt_feats[5])
        voc['spectral_centroid'] = np.mean(mt_feats[3])
        voc['rolloff_90'] = np.mean(mt_feats[7])
        voc['rolloff_50'] = np.mean(mt_feats[8])
        voc['rolloff_25'] = np.mean(mt_feats[9])
        voc['rolloff_10'] = np.mean(mt_feats[10])
        voc['zcr'] = np.mean(mt_feats[0])
        voc['spectral_spread'] = np.mean(mt_feats[4])
def extractAudioFeatures(audioPath):
    # Check if file exists
    if not os.path.exists(audioPath):
        raise Exception('File not found!')
    # Extract features from the audio file: a single mid-term window
    # spanning the whole signal, with 1-second short-term windows
    [Fs, x] = audioBasicIO.readAudioFile(audioPath)
    x = audioBasicIO.stereo2mono(x)
    mF, sF = audioFeatureExtraction.mtFeatureExtraction(x, Fs, len(x),
                                                        len(x), Fs, Fs)
    res = list()
    for item in mF:
        res.append(item[0])
    return res
def extract_mid_features(input_file):
    class_names = [os.path.basename(input_file)]
    features = []
    fs, x = readAudioFile(input_file)
    x = stereo2mono(x)
    mt_size, mt_step, st_win, st_step = 1, 0.4, 0.025, 0.010
    [mt_feats, st_feats, _] = mtFeatureExtraction(x, fs, mt_size * fs,
                                                  mt_step * fs,
                                                  round(st_win * fs),
                                                  round(st_step * fs))
    # also write the extracted features to file(s) named after the input
    mtFeatureExtractionToFile(input_file, mt_size, mt_step, st_win, st_step,
                              input_file, False, True, True)
    return mt_feats, st_feats, _
def hmmSegmentation(wav_file_name, hmm_model_name, plot_res=False,
                    gt_file_name=""):
    [fs, x] = audioBasicIO.readAudioFile(wav_file_name)
    try:
        fo = open(hmm_model_name, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classes_all = cPickle.load(fo)
        mt_win = cPickle.load(fo)
        mt_step = cPickle.load(fo)
    except:
        fo.close()
    fo.close()

    [Features, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * 0.050),
                                              round(fs * 0.050))
    flags_ind = hmm.predict(Features.T)  # apply model
    if os.path.isfile(gt_file_name):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file_name)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)
        flagsGTNew = []
        for j, fl in enumerate(flags_gt):  # "align" labels with GT
            if class_names_gt[flags_gt[j]] in classes_all:
                flagsGTNew.append(
                    classes_all.index(class_names_gt[flags_gt[j]]))
            else:
                flagsGTNew.append(-1)
        cm = numpy.zeros((len(classes_all), len(classes_all)))
        flags_ind_gt = numpy.array(flagsGTNew)
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, classes_all,
                                  mt_step, not plot_res)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, classes_all, -1, -1)
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification
    using a single annotated audio file
    ARGUMENTS:
     - wavFile:      the path of the audio filename
     - gtFile:       the path of the ground truth filename
                     (a csv file of the form
                     <segment start in seconds>,<segment end in seconds>,
                     <segment label> in each row)
     - hmmModelName: the name of the HMM model to be stored
     - mtWin:        mid-term window size
     - mtStep:       mid-term window step
    RETURNS:
     - hmm:          an object to the resulting HMM
     - classNames:   a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values
    are stored in the hmmModelName file
    '''
    # read ground truth data
    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)
    # convert to fix-sized sequence of flags
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)
    [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
    #F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs)
    [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                    round(Fs * 0.050),
                                    round(Fs * 0.050))  # feature extraction
    # compute HMM statistics (priors, transition matrix, etc)
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    # output to file
    fo = open(hmmModelName, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
    return hmm, classNames
def bufferRegression(audioBuffer, sampleRate, model_name, model_type):
    # Load regression models:
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" \
            or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(regression_models[0], True)

    Fs = sampleRate
    x = audioBuffer
    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("bufferRegression: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # regression
    return R, regression_names
def _feature_extractor(self, frames):
    # zero-pad the buffer up to one analysis window if it is too short
    if len(frames) < self.Win:
        frames = np.append(frames, [0] * (self.Win - len(frames)))
    frames = sound.normalize(frames, -20.0)
    frames = sound.apply_filter(self.FilterB, self.FilterA, frames)
    assert len(frames) >= self.Win
    [_mf, _f] = aF.mtFeatureExtraction(frames, self.sr, self.Win,
                                       self.Win / 2, self.Win / 4,
                                       self.Win / 4)
    _f = _mf.transpose()
    return _f
def get_segments(self, file, part, video_id):
    try:
        audio_file = "aud.part" + str(part) + "." + file + ".wav"
        fs, s = aIO.readAudioFile(audio_file)
        af, _, afn = aF.mtFeatureExtraction(s, fs, int(0.5 * fs),
                                            int(0.5 * fs), int(0.1 * fs),
                                            int(0.1 * fs))
        video_file = "vid.part" + str(part) + "." + file
        vf, t, vfn = self._video_extractor.extract_features(video_file)
    except Exception as e:
        print(e)
        return None
    else:
        segment_features = []
        # construct segment features: one averaged value per video feature...
        vmean = vf.T.mean(axis=1)
        for i, val in enumerate(vmean):
            if isinstance(val, np.ndarray):
                val = val[0]
            feature = SegmentFeatures(value=val, seq_no=1,
                                      feature_id=self._feature_map[vfn[i]])
            segment_features.append(feature)
        # ...and one averaged value per audio feature
        amean = af.mean(axis=1)
        for i, val in enumerate(amean):
            feature = SegmentFeatures(value=val, seq_no=1,
                                      feature_id=self._feature_map[afn[i]])
            segment_features.append(feature)
        # construct the segment with its segment features
        segment = Segment(video_id=video_id, start_sec=part,
                          end_sec=self._splitter.get_segment_end(part),
                          features=segment_features)
        return segment
def extractFeatures(fs, signal):
    '''
    spf = wave.open('WaveFiles/test.wav', 'r')
    signal = spf.readframes(-1)
    fs = spf.getframerate()
    signal = np.fromstring(signal, 'int16')
    time = np.linspace(0, len(signal)/fs, num=len(signal))
    '''
    F, Y = audioFeatureExtraction.mtFeatureExtraction(signal, fs, 0.025 * fs,
                                                      0.025 * fs, 0.050 * fs,
                                                      0.025 * fs)
    #meanMFCC = getMeanMFCC(F)
    amplitudePeak = getAmplitudePeak(signal)
    numPeaks = getNumPeak(F[5, :])
    maxPeak = getMaxPeak(signal, fs)
    centroid, spectrum = stSpectralCentroidAndSpread(signal, fs)
    #rolloff = stSpectralRollOff(signal, 0.85, fs)
    #maxFlux = np.amax(F[6])
    #avgFlux = np.mean(F[6])
    return [amplitudePeak, numPeaks, centroid, spectrum]
def get_features_from_wav(wav_path, sec):
    """
    Samples audio by given time window
    :param wav_path: path to .wav file
    :param sec: float, sampling frame size in sec
    :return: pandas.DataFrame with sampled audio of shape
             (n_samples, frames_per_sample)
    """
    rate, audio = wav.read(wav_path)
    short_frame = rate * sec
    mt_features = mtFeatureExtraction(audio, rate, mtWin=short_frame * 10,
                                      mtStep=short_frame, stWin=short_frame,
                                      stStep=short_frame)
    big_mat = np.vstack([mt_features[0], mt_features[1]]).T
    big_mat = StandardScaler().fit_transform(big_mat)
    big_df = pd.DataFrame(big_mat)
    colnames = ["pyAA{}".format(i) for i in range(big_mat.shape[1])]
    big_df.columns = colnames
    return big_df
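# A minimal sketch of calling get_features_from_wav; the path is a
# placeholder. With sec=0.05 each short-term frame is 50 ms and each
# mid-term window covers ten of them.
df = get_features_from_wav("sample.wav", 0.05)
print(df.shape, list(df.columns)[:3])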
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    """
    This function trains a HMM model for segmentation-classification
    using a single annotated audio file
    ARGUMENTS:
     - wav_file:       the path of the audio filename
     - gt_file:        the path of the ground truth filename
                       (a csv file of the form
                       <segment start in seconds>,<segment end in seconds>,
                       <segment label> in each row)
     - hmm_model_name: the name of the HMM model to be stored
     - mt_win:         mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:    a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    """
    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                       round(fs * 0.050), round(fs * 0.050))
    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(class_names, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
    return hmm, class_names
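# A minimal sketch of training and then applying an HMM segmenter with the
# functions above; the file names are placeholders. "sample.segments" is
# assumed to be a CSV of <start>,<end>,<label> rows as described in the
# docstring.
hmm, class_names = trainHMM_fromFile("sample.wav", "sample.segments",
                                     "hmmTemp", 1.0, 1.0)
flags_ind, classes, acc, cm = hmmSegmentation("sample.wav", "hmmTemp",
                                              plot_res=False)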
def __call__(self, input_file):
    (frame_rate, x) = aIO.readAudioFile(input_file)
    if frame_rate < 0:  # could not read the file
        return None
    [feats, s] = aF.mtFeatureExtraction(aIO.stereo2mono(x), frame_rate,
                                        self.mt_win * frame_rate,
                                        self.mt_step * frame_rate,
                                        round(frame_rate * self.st_win),
                                        round(frame_rate * self.st_step))
    feats = feats.mean(axis=1)  # long term averaging
    feats = (feats - self.model_mean) / self.model_sd  # normalization
    p = self.classifier.predict_proba(feats.reshape(1, -1))[0]
    out = dict(zip(self.class_names, map(float, p)))
    out.update({
        "_frame_rate": float(frame_rate),
        "_duration_seconds": (float(x.shape[0]) / frame_rate
                              if frame_rate > 0 else None)
    })
    return out
def bufferClassification(audioBuffer, sampleRate, model_name, model_type):
    # Load classifier:
    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)

    if isinstance(audioBuffer, int):  # audio buffer format problem
        print("bufferClassification: bad audio format!")
        return (-1, -1, -1)
    if audioBuffer.shape[0] / float(sampleRate) <= mt_win:
        print("bufferClassification: too little audio to analyze "
              "with medium term window", mt_win)
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(audioBuffer, sampleRate,
                                                 mt_win * sampleRate,
                                                 mt_step * sampleRate,
                                                 round(sampleRate * st_win),
                                                 round(sampleRate * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization
    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    '''
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name: the name of the HMM model to be stored
     - mt_win:         mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:    a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    '''
    flags_all = numpy.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs,
                                        mt_step)
        for c in class_names:  # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                           round(fs * 0.050),
                                           round(fs * 0.050))
        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]
        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))
        flags_all = numpy.append(flags_all, numpy.array(flagsNew))
        if i == 0:
            f_all = F
        else:
            f_all = numpy.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")  # train HMM
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
    return hmm, classes_all
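# A minimal sketch of directory-level HMM training with the function above;
# "annotated_wavs" is a placeholder directory that must contain matching
# .wav and .segments files.
hmm, classes = trainHMM_fromDir("annotated_wavs", "hmmTemp", 1.0, 1.0)
print(classes)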
#iterate over track
while data != '':
    print("At second: ")
    print(counter * chunk_size)
    counter += 1
    x_data.append(counter * chunk_size)
    #stream.write(data)
    array = _wav2array(wf.getnchannels(), wf.getsampwidth(), data)
    array = audioBasicIO.stereo2mono(array)
    #extract features
    MidTermFeatures = aF.mtFeatureExtraction(array, Fs, mtWin * Fs,
                                             mtStep * Fs, round(Fs * stWin),
                                             round(Fs * stStep))
    MidTermFeatures = MidTermFeatures[0]
    #classify chunks to speech/music
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures[0].shape[0]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        # classify vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)
        flagsInd.append(Result)
def pyAudioAnalysis_features(x, Fs):
    # [Fs, x] = audioBasicIO.readAudioFile(file_name)
    # stF = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs,
    #                                                  0.05 * Fs)
    mtF = audioFeatureExtraction.mtFeatureExtraction(x, Fs, 1 * Fs, 1 * Fs,
                                                     0.5 * Fs, 0.5 * Fs)
    return mtF
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=0, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording
                           (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT   (opt)     0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName)
    x = pyAudioAnalysis.audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    #[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    #[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.loadKNNModel(
        "pyAudioAnalysis/data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))

    # append the speaker-KNN and gender-KNN posteriors to each feature vector
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = \
            MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2  # TODO

    # SELECT FEATURES (alternative sets kept commented out):
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]  # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]  # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 0C
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]  # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]  # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]  # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 2C
    #iFeaturesSelect = range(100)  # SET 3

    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures(
        [MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print(iNonOutLiers)

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / \
        numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs,
        #    stWin * Fs, round(Fs*stWin), round(Fs*stWin))
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])
        for i in range(numOfFeatures):
            # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) +
             len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = \
                mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:
                                mtFeaturesToReduce.shape[0] +
                                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print(LDAstep, LDAstepRatio)
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples that belong to
                # the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = \
                            numpy.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = \
                            MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance from
                # the "nearest" cluster)
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # silhouette
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange), 0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by
    # giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')  # annotated file
    if os.path.isfile(gtFile):  # if ground-truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0,
                     flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: "
                      "{1:.1f}%".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print(sRange, silAll)
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
def speakerDiarization(fileName, sRange=range(2, 10), mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, \
        computeBEAT1 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, \
        computeBEAT2 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = \
            MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):,
                         i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2

    iFeaturesSelect = list(range(8, 21)) + list(range(41, 54))
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / \
        numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    if LDAdim > 0:
        mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, \
            numOfStatistics = int(round(mtSize / stWin)), \
            int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())
        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) +
             len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = \
                mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:
                                mtFeaturesToReduce.shape[0] +
                                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll, centersAll = list(), list(), list()
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = \
                            numpy.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = \
                            MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]
    return cls, classNames, duration, mtStep, silAll
def mtFileClassification(inputFile, modelName, modelType, plotResults=False,
                         gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:    path of the input WAV file
        - modelName:    name of the classification model
        - modelType:    svm or knn depending on the classifier type
        - plotResults:  True if results are to be plotted using
                        matplotlib along with a set of statistics
    RETURNS:
        - segs:         a sequence of segment's endpoints: segs[i] is the
                        endpoint of the i-th segment (in seconds)
        - classes:      a sequence of class flags: class[i] is the class ID
                        of the i-th segment
    '''
    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelType not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if (modelType == 'svm') or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadExtraTreesModel(modelName)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    # convert stereo (if) to mono
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures.shape[1]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        [Result, P] = aT.classifierWrapper(Classifier, modelType,
                                           curFV)  # classify vector
        flagsInd.append(Result)
        # update class label matrix
        flags.append(classNames[int(Result)])
        # update probability matrix
        Ps.append(numpy.max(P))
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mtStep)
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT,
                                           mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep,
                                  not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:          a sequence of segment's endpoints: segs[i] is the
                         endpoint of the i-th segment (in seconds)
        - classes:       a sequence of class flags: class[i] is the class ID
                         of the i-th segment
    '''
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):  # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
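# A minimal sketch of fixed-window segmentation with mtFileClassification
# above; "sample.wav" and the "svmSM" speech/music model name are
# placeholders. With no ground-truth file, acc is -1 and cm is empty.
flags_ind, classes, acc, cm = mtFileClassification("sample.wav", "svmSM",
                                                   "svm", plot_results=False)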
def emotion_from_speech(Fs, x, log,
                        model_name="pyAudioAnalysis/pyAudioAnalysis/data/"
                                   "svmSpeechEmotion",
                        model_type="svm"):
    """
    :param Fs: frame rate (sampling frequency)
    :param x: data (raw 16-bit sample bytes)
    :param log: logger
    :param model_name:
    :param model_type:
    :return: dict with "valence" and "arousal" values in [-1, 1]
    """
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    emotion = {"valence": None, "arousal": None}

    # Feature extraction
    x = np.fromstring(x, np.int16)
    if model_type == 'svm' or model_type == "svm_rbf" \
            or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(regression_models[0], True)
    else:
        return emotion
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # Regression
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("emotion_from_speech: input model_name not found!")
            return emotion
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = aT.load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(aT.regressionWrapper(model, model_type, curFV))

    # clip the regression outputs to the valid [-1, 1] range
    if R[0] > 1:
        log.warning("Valence > 1")
        emotion["valence"] = 1
    elif R[0] < -1:
        log.warning("Valence < -1")
        emotion["valence"] = -1
    else:
        emotion["valence"] = R[0]
    if R[1] > 1:
        log.warning("Arousal > 1")
        emotion["arousal"] = 1
    elif R[1] < -1:
        log.warning("Arousal < -1")
        emotion["arousal"] = -1
    else:
        emotion["arousal"] = R[1]
    return emotion
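# A minimal sketch of calling emotion_from_speech; the wave-module read and
# the logger are illustrative assumptions, and the input is assumed to be a
# mono 16-bit WAV so the raw bytes match the np.fromstring(..., np.int16)
# call inside the function.
import logging
import wave

wf = wave.open("sample.wav", "rb")
raw = wf.readframes(wf.getnframes())
result = emotion_from_speech(wf.getframerate(), raw,
                             logging.getLogger(__name__))
print(result)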
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    '''
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results, 1 for plotting
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001
    mt_feats = MidTermFeatures2  # TODO

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(
        distance.pdist(mt_feats_norm.T)), axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print(i_non_outliers)

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs,
        #    mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])
        for i in range(num_of_features):
            # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print(LDAstep, LDAstepRatio)
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = \
                            numpy.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clust_per_cent + clust_per_cent_2) /
                                     2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value
                # (i.e. the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = trainHMM_computeStatistics(
            mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(numpy.array(range(len(flags_gt))) * mt_step +
                     mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = evaluateSpeakerDiarization(
            cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print(s_range, sil_all)
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
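# A minimal sketch of running the diarizer above on a two-speaker recording;
# the file name is a placeholder. The return value is one cluster label per
# 0.2-second mid-term step; lda_dim=0 skips the LDA reduction stage.
cls = speakerDiarization("conversation.wav", 2, lda_dim=0, plot_res=False)
print(cls[:20])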
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:      the path of the data directory
     - hmmModelName: the name of the HMM model to be stored
     - mtWin:        mid-term window size
     - mtStep:       mid-term window step
    RETURNS:
     - hmm:          an object to the resulting HMM
     - classNames:   a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values
    are stored in the hmmModelName file
    '''
    flagsAll = numpy.array([])
    initializedFall = False
    classesAll = []
    # for each WAV file
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        wavFile = f
        # open the annotation file
        gtFile = f.replace('.wav', '.segments')
        # if current WAV file does not have annotation -> skip
        if not os.path.isfile(gtFile):
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels,
                                       mtStep)  # convert to flags
        # update classnames:
        for c in classNames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
        [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                        round(Fs * 0.050),
                                        round(Fs * 0.050))  # features
        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]
        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))
        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))
        if not initializedFall:
            Fall = F
            initializedFall = True
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)

    # compute HMM statistics
    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall,
                                                                 flagsAll)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
    return hmm, classesAll