Example #1
def fileRegression(inputFile, model_name, model_type, feats=["gfcc", "mfcc"]):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(regression_models[0], True)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step), feats)
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step, compute_beat] = \
                load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # regression prediction
    return R, regression_names
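A minimal usage sketch for the regression wrapper above, assuming pre-trained pyAudioAnalysis regression models stored under the hypothetical prefix data/svmSpeechEmotion (i.e. files such as data/svmSpeechEmotion_valence plus their *MEANS companions):

predictions, dimension_names = fileRegression("sample.wav",             # hypothetical input WAV
                                              "data/svmSpeechEmotion",  # hypothetical model prefix
                                              "svm")
for name, value in zip(dimension_names, predictions):
    print("{0:s}: {1:.3f}".format(name, value))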
Example #2
    def __init__(self):
        self.BEAM_WIDTH = 500
        self.LM_ALPHA = 0.75
        self.LM_BETA = 1.85
        self.model_dir = 'DeepSpeech/data/wernicke/model/'
        self.model_file = os.path.join(self.model_dir, 'output_graph.pb')
        # self.model_dir = 'deepspeech-0.6.0-models/'
        # self.model_file = os.path.join(self.model_dir, 'output_graph.pbmm')
        self.lm_file = os.path.join(self.model_dir, 'lm.binary')
        self.trie_file = os.path.join(self.model_dir, 'trie')

        self.save_dir = 'saved_wavs'
        os.makedirs(self.save_dir, exist_ok=True)

        # load segment model
        log.info('Initializing pyAudioAnalysis classifier model...')
        [
            self.classifier, self.MEAN, self.STD, self.class_names,
            self.mt_win, self.mt_step, self.st_win, self.st_step, _
        ] = aT.load_model("wernicke_server_model")
        self.fs = 16000

        log.info('Initializing deepspeech model...')
        self.model = deepspeech.Model(self.model_file, self.BEAM_WIDTH)
        # NOTE: there may not be enough samples yet for useful LM and trie files;
        # comment out the enableDecoderWithLM() call below to disable LM decoding.
        self.model.enableDecoderWithLM(self.lm_file, self.trie_file,
                                       self.LM_ALPHA, self.LM_BETA)

        log.info('Models ready.')
Example #3
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
	if not os.path.isfile(modelName):
		raise Exception("Input modelName not found!")

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model_knn(modelName)

	PsAll = numpy.zeros((len(classNames), ))	
		
	files = "*.wav"
	if os.path.isdir(inputFolder):
		strFilePattern = os.path.join(inputFolder, files)
	else:
		strFilePattern = inputFolder + files

	wavFilesList = []
	wavFilesList.extend(glob.glob(strFilePattern))
	wavFilesList = sorted(wavFilesList)
	if len(wavFilesList)==0:
		print ("No WAV files found!")
		return 
	
	Results = []
	for wavFile in wavFilesList:	
		[Fs, x] = audioBasicIO.readAudioFile(wavFile)	
		signalLength = x.shape[0] / float(Fs)
		[Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)					
		PsAll += (numpy.array(P) * signalLength)		
		Result = int(Result)
		Results.append(Result)
		if outputMode:
			print ("{0:s}\t{1:s}".format(wavFile,classNames[Result]))
	Results = numpy.array(Results)
	
	# print distribution of classes:
	[Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames)+1))
	if outputMode:	
		for i,h in enumerate(Histogram):
			print( "{0:20s}\t\t{1:d}".format(classNames[i], h))
	PsAll = PsAll / numpy.sum(PsAll)


	if outputMode:	
		fig = plt.figure()
		ax = fig.add_subplot(111)
		plt.title("Classes percentage " + inputFolder.replace('Segments',''))
		ax.axis((0, len(classNames)+1, 0, 1))
		ax.set_xticks(numpy.array(range(len(classNames)+1)))
		ax.set_xticklabels([" "] + classNames)
		ax.bar(numpy.array(range(len(classNames)))+0.5, PsAll)
		plt.show()
	return classNames, PsAll
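A usage sketch for the folder classifier above, with a hypothetical folder of WAV segments and a hypothetical pre-trained svm model:

class_names, class_distribution = classifyFolderWrapper("segments/",   # hypothetical folder of WAVs
                                                        "svm",
                                                        "data/svmSM",  # hypothetical model path
                                                        outputMode=True)
for name, percentage in zip(class_names, class_distribution):
    print("{0:20s}{1:.2f}".format(name, percentage))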
Example #4
def fileClassification(inputFile,
                       model_name,
                       model_type,
                       feats=["gfcc", "mfcc"]):
    # Load classifier:

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model(model_name)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mt_win:
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step), feats)
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    [Result, P] = classifierWrapperHead(classifier, model_type,
                                        curFV)  # classification
    return Result, P, classNames
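A usage sketch, with hypothetical file and model paths; the function returns (-1, -1, -1) when the file or model cannot be used:

result, probabilities, class_names = fileClassification("sample.wav",  # hypothetical input
                                                        "data/svmSM",  # hypothetical model
                                                        "svm")
if class_names != -1:
    winner = int(result)
    print("Predicted: {0:s} (p = {1:.2f})".format(class_names[winner], probabilities[winner]))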
Example #5
def getMusicSegmentsFromFile(inputFile):	
	modelType = "svm"
	modelName = "data/svmMovies8classes"
	
	dirOutput = inputFile[0:-4] + "_musicSegments"
	
	if os.path.exists(dirOutput) and dirOutput!=".":
		shutil.rmtree(dirOutput)	
	os.makedirs(dirOutput)	
	
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)	

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model_knn(modelName)

	flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = "")
	segs, classes = aS.flags2segs(flagsInd, mtStep)

	for i, s in enumerate(segs):
		if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
			strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput+os.sep, s[0], s[1])	
			wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
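The function above depends on a module-level minDuration constant and the data/svmMovies8classes model it references; a usage sketch under those assumptions:

minDuration = 8.0  # assumed module-level constant: minimum music-segment length in seconds
getMusicSegmentsFromFile("movie_audio.wav")  # hypothetical input; WAV segments are written to movie_audio_musicSegments/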
Example #6
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results, 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))


    mid_feats, st_feats, a = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))
    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4
    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        #clf.fit(mt_feats_to_red.T, labels)
        #mid_feats_norm = (clf.transform(mid_feats_norm.T)).T
    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []
    
    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_ 
        cluster_labels.append(cls)
#        cluster_centers.append(means)
        sil_1 = []; sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist)*clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist)*(clust_per_cent
                                                       + clust_per_cent_2)/2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
#    print(cls)
#    cls = np.zeros((n_wins,))
#    for index in range(n_wins):
#        j = np.argmin(np.abs(index-i_non_outliers))
#        cls[index] = cluster_labels[imax][j]
    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat            
            hmm.means_ = means; hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)                        
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
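A usage sketch for the diarization function above, assuming the bundled svm_rbf speaker models exist under the package's data/models directory and a hypothetical two-speaker recording:

labels, cluster_purity, speaker_purity = speaker_diarization(
    "interview.wav",   # hypothetical two-speaker recording
    n_speakers=2,      # pass 0 (or a negative value) to estimate the speaker count
    mid_window=2.0,
    mid_step=0.2,
    plot_res=False)
print("first 10 window labels:", labels[:10])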
Example #7
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the
                            class ID of the i-th segment
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
            aT.load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    for i in range(
            mt_feats.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] -
                  MEAN) / STD  # normalize current feature vector
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
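A usage sketch for the segment-level classifier above, with hypothetical paths; pass a .segments ground-truth file to also obtain accuracy and a confusion matrix:

flags, class_names, accuracy, conf_matrix = mtFileClassification(
    "radio_show.wav",  # hypothetical input stream
    "data/svmSM",      # hypothetical pre-trained model
    "svm",
    plot_results=False,
    gt_file="")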
Example #8
def mid_term_file_classification(input_file,
                                 model_name,
                                 model_type,
                                 plot_results=False,
                                 gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
    RETURNS:
          - labels:         a numpy array of class indices, one per mid-term segment
          - class_names:    the list of class names of the loaded model
          - accuracy:       overall accuracy (0.0 if no ground-truth file is given)
          - cm:             confusion matrix (empty if no ground-truth file is given)
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return labels, class_names, accuracy, cm
    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature v
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    segs[-1] = len(signal) / float(sampling_rate)
    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step, plot_results)

    return labels, class_names, accuracy, cm
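A usage sketch for the snake_case variant above, with hypothetical paths; without a ground-truth file the accuracy stays 0.0 and the confusion matrix stays empty:

labels, class_names, accuracy, conf_matrix = mid_term_file_classification(
    "broadcast.wav",            # hypothetical input
    "data/svm_speech_music",    # hypothetical model trained with audioTrainTest
    "svm",
    plot_results=False,
    gt_file="broadcast.segments")  # hypothetical ground-truth file
print("segment labels:", labels)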
Example #9
from google.cloud.speech import enums
from google.cloud.speech import types

classes = ['speech', 'music']
wf = wave.open('./data/01_Radioaufnahmen_Musik_Jingle_Sprache_mono2.wav', 'rb')

print('sample rate: ' + str(wf.getframerate() / 1000.0) + 'kHz')
print('channels: ' + str(wf.getnchannels()))

print('initialize pocketsphinx...')
sd = SpeechDetector.SpeechDetector()
print('... done.')

print('initialize speech/music classifier...')
[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
 computeBEAT] = aT.load_model("./data/speech_music_classifier/svmSM")
modelType = "svm"
print('... done.')

print('initialize genre classifier...')
genre_recognizer = GenreRecognizer.GenreRecognizer(
    './data/genre_classifier/model.yaml', './data/genre_classifier/weights.h5')
print('... done.')

p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
import time
Example #10
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow, weight, model_name, threshold):

    if not os.path.isfile(model_name):
        print("vadFolderWrapperMergedByTh: input model_name not found!")
        return

    classifier, mean, std, classes, mid_window, mid_step, short_window, \
    short_step, compute_beat = aT.load_model(model_name)

    types = ('*.wav', '*.mp3')

    wavFilesList = []
    for files in types:
        print(inputFolder + files)
        wavFilesList.extend(glob.glob((inputFolder + files)))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return
    for wavFile in wavFilesList:
        # print(wavFile)
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")
        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')

        stack = deque()

        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothingWindow, weight, False)
        merge = True

        for i, st in enumerate(segmentLimits):


            signal = audioBasicIO.stereo_to_mono(x[int(fs * st[0]):int(fs * st[1])])
            # print('in here', len(segmentLimits), st[0],st[1],classes, type(st))
            if fs == 0:
                continue
                # audio file IO problem
                # return -1, -1, -1

            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)

            # feature extraction:
            mid_features, s, _ = \
                aF.mid_feature_extraction(signal, fs,
                                          mid_window * fs,
                                          mid_step * fs,
                                          round(fs * short_window),
                                          round(fs * short_step))
            # long term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                # print('in here3')
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            feature_vector = (mid_features - mean) / std  # normalization
            # class_id = -1
            # probability = -1
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            # probability = classifier.predict_proba(feature_vector.reshape(1, -1))[0]
            print(class_id, type(class_id))
            label = classes[int(class_id)]

            print(label)
            if label == 'speech':
                dur = st[1] - st[0]
                # print('in hereas')
                if merge:
                    seg_prev = []
                    if len(stack) > 0:
                        seg_prev = stack.pop()

                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        # the merged segment would get too long: keep the
                        # previous one and start a new speech segment
                        seg = [st[0], st[1], label]
                        stack.append(seg_prev)
                        stack.append(seg)
                        merge = True
                    elif len(seg_prev) > 0:
                        # extend the previous speech segment to the end of this one
                        seg = [seg_prev[0], st[1], label]
                        stack.append(seg)
                        merge = True
                    else:
                        seg = [st[0], st[1], label]
                        stack.append(seg)
                        merge = True
                else:
                    seg = [st[0], st[1], label]
                    stack.append(seg)
                    merge = True

            else:
                merge = False
            print(i, merge)
        # print(len(segmentLimits), len(stack))
        for sn in stack:
            # print(type(wavFile), sn[0].shape, sn[1].shape, type(sn[0]), type(sn[1]))

            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1]) + ".wav"

                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + "\n")
            segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        segfile.close()
        segfile2.close()
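A usage sketch for the wrapper above, with hypothetical folder and model paths (both folders are plain string prefixes, so they need a trailing separator):

vadFolderWrapperMergedByTh(
    "recordings/",        # hypothetical input folder with *.wav / *.mp3 files
    "vad_out/",           # hypothetical output folder for segment lists and WAV snippets
    smoothingWindow=0.5,
    weight=0.3,
    model_name="data/svm_speech_music",  # hypothetical speech/non-speech model
    threshold=10.0)       # stop merging once a speech segment spans this many seconds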
Example #11
def FileClassification(input_file, model_name, model_type, gt=False,
                       gt_file=""):
    '''
    TODO: This function needs to be refactored according to the code in
    audioSegmentation.mid_term_file_classification()
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)

    # Load classifier with load_model:
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     compute_beat] = aT.load_model(model_name)

    # Using audioBasicIO from pyAudioAnalysis, the input audio stream is loaded
    [fs, x] = audioBasicIO.read_audio_file(input_file)
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs

    # mid-term feature extraction using pyAudioAnalysis mtFeatureExtraction:
    [mt_feats, _, _] = mF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # per-class posterior-probability thresholds for accepting a prediction
    prob_thresholds = {0.0: 0.5, 1.0: 0.9, 2.0: 0.6, 3.0: 0.3, 4.0: 0.3}
    for i in range(mt_feats.shape[1]):
        # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        [res, P] = aT.classifier_wrapper(classifier, model_type, cur_fv)
        if res in prob_thresholds:
            if numpy.max(P) > prob_thresholds[res]:
                flags_ind.append(res)
                flags.append(class_names[int(res)])  # update class label matrix
                Ps.append(numpy.max(P))  # update probability matrix
            else:
                flags_ind.append(-1)
                flags.append('None')
                Ps.append(-1)
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = aS.labels_to_segments(flags, mt_step)
    segs[-1] = len(x) / float(fs)
    if gt:
        # Load ground-truth:
        if os.path.isfile(gt_file):
            [seg_start_gt, seg_end_gt, seg_l_gt] = aS.read_segmentation_gt(gt_file)
            flags_gt, class_names_gt = aS.segments_to_labels(seg_start_gt, seg_end_gt, seg_l_gt, mt_step)
            flags_ind_gt = []
            # print(class_names)
            for j, fl in enumerate(flags_gt):
                # "align" labels with GT
                # print(class_names_gt[flags_gt[j]])
                if class_names_gt[flags_gt[j]] in class_names:
                    flags_ind_gt.append(class_names.index(class_names_gt[flags_gt[j]]))
                else:
                    flags_ind_gt.append(-1)
            flags_ind_gt = numpy.array(flags_ind_gt)
            cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
            for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
                cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
        else:
            cm = []
            flags_ind_gt = numpy.array([])
        acc = aS.plot_segmentation_results(flags_ind, flags_ind_gt,
                                           class_names, mt_step, False)
    else:
        cm = []
        flags_ind_gt = numpy.array([])
        acc = aS.plot_segmentation_results(flags_ind, flags_ind_gt,
                                           class_names, mt_step, False)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
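A usage sketch, assuming a five-class model (matching the per-class probability thresholds above); all paths are hypothetical:

flags, class_names, accuracy, conf_matrix = FileClassification(
    "scene.wav",          # hypothetical input stream
    "data/svm_5class",    # hypothetical pre-trained 5-class model
    "svm",
    gt=True,
    gt_file="scene.segments")  # ground-truth is only read when gt=True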
Example #12
def get_coordinates_from_audio(block, rms_min_max=[0, 25000]):
    mid_buf = []
    global all_data
    global outstr
    all_data = []
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")

    # load segment model
    [classifier, mu, std, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")

    [clf_energy, mu_energy, std_energy, class_names_energy,
     mt_win_en, mt_step_en, st_win_en, st_step_en, _] = \
        aT.load_model("energy")

    [clf_valence, mu_valence, std_valence, class_names_valence,
     mt_win_va, mt_step_va, st_win_va, st_step_va, _] = \
        aT.load_model("valence")

    count_b = len(block) / 2
    format_h = "%dh" % (count_b)
    shorts = struct.unpack(format_h, block)
    cur_win = list(shorts)
    mid_buf = mid_buf + cur_win
    del cur_win
    # data-driven time
    x = numpy.int16(mid_buf)
    seg_len = len(x)
    r = audioop.rms(x, 2)
    if r < rms_min_max[0]:
        # set new min in case the default value is exceeded
        rms_min_max[0] = r
    if r > rms_min_max[1]:
        # set new max in case the default value is exceeded
        rms_min_max[1] = r
    r_norm = float(r - rms_min_max[0]) / float(rms_min_max[1] - rms_min_max[0])
    r_map = int(r_norm * 255)
    print(
        f'RMS: {r}; MIN: {rms_min_max[0]}; MAX: {rms_min_max[1]}; NORM: {r_norm}; MAP: {r_map}'
    )
    # extract features
    # We are using the signal length as mid term window and step,
    # in order to guarantee a mid-term feature sequence of len 1
    [mt_f, _, _] = mF(x, fs, seg_len, seg_len, round(fs * st_win),
                      round(fs * st_step))
    fv = (mt_f[:, 0] - mu) / std
    # classify vector:
    [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", fv)
    win_class = class_names[int(res)]
    if prob[class_names.index("silence")] > 0.8:
        soft_valence = 0
        soft_energy = 0
        print("Silence")
    else:
        # extract features for music mood
        [f_2, _, _] = mF(x, fs, round(fs * mt_win_en), round(fs * mt_step_en),
                         round(fs * st_win_en), round(fs * st_step_en))
        [f_3, _, _] = mF(x, fs, round(fs * mt_win_va), round(fs * mt_step_va),
                         round(fs * st_win_va), round(fs * st_step_va))

        # normalize feature vector
        fv_2 = (f_2[:, 0] - mu_energy) / std_energy
        fv_3 = (f_3[:, 0] - mu_valence) / std_valence
        [res_energy, p_en] = aT.classifier_wrapper(clf_energy, "svm_rbf", fv_2)

        win_class_energy = class_names_energy[int(res_energy)]
        [res_valence, p_val] = aT.classifier_wrapper(clf_valence, "svm_rbf",
                                                     fv_3)

        win_class_valence = class_names_valence[int(res_valence)]
        soft_energy = p_en[class_names_energy.index("high")] - \
                        p_en[class_names_energy.index("low")]
        soft_valence = p_val[class_names_valence.index("positive")] - \
                        p_val[class_names_valence.index("negative")]

        print(win_class, win_class_energy, win_class_valence, soft_valence,
              soft_energy)

    global prev_valence_and_energy
    if prev_valence_and_energy is None:
        prev_valence_and_energy = (soft_valence, soft_energy)
    valence_difference = abs(prev_valence_and_energy[0] - soft_valence)
    energy_difference = abs(prev_valence_and_energy[1] - soft_energy)
    bound = 0.2
    should_change = valence_difference > bound or energy_difference > bound

    all_data += mid_buf
    mid_buf = []
    h, w, _ = img.shape
    y_center, x_center = int(h / 2), int(w / 2)
    x = x_center + int((w / 2) * (soft_valence if not should_change
                                  else prev_valence_and_energy[0]))
    y = y_center - int((h / 2) * (soft_energy if not should_change
                                  else prev_valence_and_energy[1]))

    if should_change:
        prev_valence_and_energy = (soft_valence, soft_energy)

    radius = 20
    alpha = format(r_map, '02x')
    return [soft_valence, soft_energy, alpha]
Example #13
def mid_term_file_classification(input_file,
                                 model_name,
                                 model_type,
                                 plot_results=False):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - labels:              a numpy array of class indices, one per mid-term segment
          - class_names:         the list of class names of the loaded model
          - mid_step:            the mid-term step (in seconds) of the loaded model
          - class_probabilities: the classifier posteriors for each segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    # print("model_name: ", model_name)
    if not os.path.isfile(model_name):
        # print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # print("class_names: ", class_names)

    classifier, mean, std, class_names, mt_win, mid_step, st_win, \
    st_step, compute_beat = at.load_model(model_name)

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    print("signal: ", signal.shape)

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = mtf.mid_feature_extraction(signal, sampling_rate,
                                                mt_win * sampling_rate,
                                                mid_step * sampling_rate,
                                                round(sampling_rate * st_win),
                                                round(sampling_rate * st_step))
    class_probabilities = []

    # print("class_names: ", class_names)

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature v
        feature_vector = (mt_feats[:, col_index] - mean) / std
        # print("col_index: ", col_index)
        # classify vector:
        label_predicted, prob = at.classifier_wrapper(classifier, model_type,
                                                      feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        class_probabilities.append(prob)
    labels = np.array(labels)

    return labels, class_names, mid_step, class_probabilities
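A usage sketch for this variant, which returns per-segment posteriors instead of accuracy and a confusion matrix; paths are hypothetical:

labels, class_names, mid_step, probabilities = mid_term_file_classification(
    "meeting.wav",            # hypothetical input
    "data/svm_speech_music",  # hypothetical model
    "svm")
for i, label in enumerate(labels):
    print("{0:.1f}s  {1:s}".format(i * mid_step, class_names[int(label)]))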
Example #14
def record_audio(block_size,
                 fs=8000,
                 show_spec=False,
                 show_chroma=False,
                 log_sounds=False,
                 logs_all=False):

    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT,
                     channels=1,
                     rate=fs,
                     input=True,
                     frames_per_buffer=mid_buf_size)
    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []
    # initialize counters etc
    time_start = time.time()
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")
    out_folder = outstr + "_segments"
    if log_sounds:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
    # load segment model
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")

    while 1:
        try:
            block = stream.read(mid_buf_size)
            count_b = len(block) / 2
            format = "%dh" % (count_b)
            shorts = struct.unpack(format, block)
            cur_win = list(shorts)
            mid_buf = mid_buf + cur_win
            del cur_win

            # time since recording started:
            e_time = (time.time() - time_start)
            # data-driven time
            data_time = (count + 1) * block_size
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_feats, _,
             _] = mF.mid_feature_extraction(x, fs, seg_len, seg_len,
                                            round(fs * st_win),
                                            round(fs * st_step))
            cur_fv = (mt_feats[:, 0] - MEAN) / STD
            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", cur_fv)
            win_class = class_names[int(res)]
            win_prob = prob[int(res)]

            if logs_all:
                all_data += mid_buf
            mid_buf = numpy.double(mid_buf)

            # Compute spectrogram
            if show_spec:
                (spec, t_axis,
                 freq_axis_s) = sF.spectrogram(mid_buf, fs, 0.050 * fs,
                                               0.050 * fs)
                freq_axis_s = numpy.array(freq_axis_s)  # frequency axis
                # most dominant frequencies (for each short-term window):
                dominant_freqs = freq_axis_s[numpy.argmax(spec, axis=1)]
                # get average most dominant freq
                max_freq = numpy.mean(dominant_freqs)
                max_freq_std = numpy.std(dominant_freqs)

            # Compute chromagram
            if show_chroma:
                (chrom, TimeAxisC,
                 freq_axis_c) = sF.chromagram(mid_buf, fs, 0.050 * fs,
                                              0.050 * fs)
                freq_axis_c = numpy.array(freq_axis_c)
                # most dominant chroma classes:
                dominant_freqs_c = freq_axis_c[numpy.argmax(chrom, axis=1)]
                # get most common among all short-term windows
                max_freqC = most_common(dominant_freqs_c)[0]

            # Plot signal window
            signalPlotCV = plotCV(
                scipy.signal.resample(mid_buf + 16000, plot_w), plot_w, plot_h,
                32000)
            cv2.imshow('Signal', signalPlotCV)
            cv2.moveWindow('Signal', 50, status_h + 50)

            # Show spectrogram
            if show_spec:
                i_spec = numpy.array(spec.T * 255, dtype=numpy.uint8)
                i_spec2 = cv2.resize(i_spec, (plot_w, plot_h),
                                     interpolation=cv2.INTER_CUBIC)
                i_spec2 = cv2.applyColorMap(i_spec2, cv2.COLORMAP_JET)
                cv2.putText(i_spec2, "max_freq: %.0f Hz" % max_freq, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Spectrogram', i_spec2)
                cv2.moveWindow('Spectrogram', 50, plot_h + status_h + 60)
            # Show chromagram
            if show_chroma:
                i_chroma = numpy.array((chrom.T / chrom.max()) * 255,
                                       dtype=numpy.uint8)
                i_chroma2 = cv2.resize(i_chroma, (plot_w, plot_h),
                                       interpolation=cv2.INTER_CUBIC)
                i_chroma2 = cv2.applyColorMap(i_chroma2, cv2.COLORMAP_JET)
                cv2.putText(i_chroma2, "max_freqC: %s" % max_freqC, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Chroma', i_chroma2)
                cv2.moveWindow('Chroma', 50, 2 * plot_h + status_h + 60)

            # Activity Detection:
            print("{0:.2f}\t{1:s}\t{2:.2f}".format(e_time, win_class,
                                                   win_prob))

            if log_sounds:
                # TODO: log audio files
                out_file = os.path.join(
                    out_folder,
                    "{0:.2f}_".format(e_time).zfill(8) + win_class + ".wav")
                #shutil.copyfile("temp.wav", out_file)
                wavfile.write(out_file, fs, x)

            textIm = numpy.zeros((status_h, plot_w, 3))
            statusStrTime = "time: %.1f sec" % e_time + \
                            " - data time: %.1f sec" % data_time + \
                            " - loss : %.1f sec" % (e_time - data_time)
            cv2.putText(textIm, statusStrTime, (0, 11), cv2.FONT_HERSHEY_PLAIN,
                        1, (200, 200, 200))
            cv2.putText(textIm, win_class, (0, 33), cv2.FONT_HERSHEY_PLAIN, 1,
                        (0, 0, 255))
            cv2.imshow("Status", textIm)
            cv2.moveWindow("Status", 50, 0)
            mid_buf = []
            ch = cv2.waitKey(10)
            count += 1
        except IOError:
            print("Error recording")
Example #15
def emotion_from_speech(Fs, x, log, model_name="pyAudioAnalysis/pyAudioAnalysis/data/svmSpeechEmotion", model_type="svm"):
    """

    :param Fs: sampling rate of the audio data (Hz)
    :param x: raw 16-bit PCM audio samples (bytes)
    :param model_name: path prefix of the pre-trained regression models
    :param model_type: regression model type ("svm", "svm_rbf" or "randomforest")
    :param log: logger used to warn about out-of-range predictions
    :return: dict with "valence" and "arousal" values clipped to [-1, 1]
    """
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_")+1::])

    emotion = {"valence": None, "arousal":None}
    # Feature extraction
    x = np.frombuffer(x, np.int16)  # np.fromstring is deprecated for binary input
    if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model(regression_models[0], True)
    else:
        return emotion

    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs, mt_step * Fs, round(Fs * st_win), round(Fs * st_step))
    mt_features = mt_features.mean(axis=1)        # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # Regression
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return emotion
        if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model(r, True)
        curFV = (mt_features - MEAN) / STD                  # normalization
        R.append(aT.regressionWrapper(model, model_type, curFV))

    if R[0] > 1:
        log.warning("Valence > 1")
        emotion["valence"] = 1
    elif R[0] < -1:
        log.warning("Valence < -1")
        emotion["valence"] = -1
    else:
        emotion["valence"] = R[0]

    if R[1] > 1:
        log.warning("Arousal > 1")
        emotion["arousal"] = 1
    elif R[1] < -1:
        log.warning("Arousal < -1")
        emotion["arousal"] = -1
    else:
        emotion["arousal"] = R[1]

    return emotion
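A usage sketch that feeds the function above with raw frames from a (hypothetical) mono 16-bit WAV read via the standard-library wave module:

import logging
import wave

log = logging.getLogger("emotion")
wf = wave.open("utterance.wav", "rb")   # hypothetical mono 16-bit recording
raw = wf.readframes(wf.getnframes())
emotion = emotion_from_speech(wf.getframerate(), raw, log)
print(emotion)  # e.g. {'valence': 0.12, 'arousal': -0.40}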
Example #16
def record_audio(block_size, devices, use_yeelight_bulbs=False, fs=8000):

    # initialize the yeelight devices:
    bulbs = []
    if use_yeelight_bulbs:
        for d in devices:
            bulbs.append(Bulb(d))
    try:
        bulbs[-1].turn_on()
    except Exception:
        bulbs = []

    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT,
                     channels=1,
                     rate=fs,
                     input=True,
                     frames_per_buffer=mid_buf_size)

    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")

    # load segment model
    [classifier, mu, std, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")

    [clf_energy, mu_energy, std_energy, class_names_energy,
     mt_win_en, mt_step_en, st_win_en, st_step_en, _] = \
        aT.load_model("energy")

    [clf_valence, mu_valence, std_valence, class_names_valence,
     mt_win_va, mt_step_va, st_win_va, st_step_va, _] = \
        aT.load_model("valence")

    while 1:
        block = stream.read(mid_buf_size)
        count_b = len(block) / 2
        format = "%dh" % (count_b)
        shorts = struct.unpack(format, block)
        cur_win = list(shorts)
        mid_buf = mid_buf + cur_win
        del cur_win
        if len(mid_buf) >= 5 * fs:
            # data-driven time
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_f, _, _] = mF(x, fs, seg_len, seg_len, round(fs * st_win),
                              round(fs * st_step))
            fv = (mt_f[:, 0] - mu) / std

            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", fv)
            win_class = class_names[int(res)]
            if prob[class_names.index("silence")] > 0.8:
                soft_valence = 0
                soft_energy = 0
                print("Silence")
            else:
                # extract features for music mood
                [f_2, _, _] = mF(x, fs, round(fs * mt_win_en),
                                 round(fs * mt_step_en), round(fs * st_win_en),
                                 round(fs * st_step_en))
                [f_3, _, _] = mF(x, fs, round(fs * mt_win_va),
                                 round(fs * mt_step_va), round(fs * st_win_va),
                                 round(fs * st_step_va))
                # normalize feature vector
                fv_2 = (f_2[:, 0] - mu_energy) / std_energy
                fv_3 = (f_3[:, 0] - mu_valence) / std_valence

                [res_energy,
                 p_en] = aT.classifier_wrapper(clf_energy, "svm_rbf", fv_2)
                win_class_energy = class_names_energy[int(res_energy)]

                [res_valence,
                 p_val] = aT.classifier_wrapper(clf_valence, "svm_rbf", fv_3)
                win_class_valence = class_names_valence[int(res_valence)]

                soft_energy = p_en[class_names_energy.index("high")] - \
                              p_en[class_names_energy.index("low")]
                soft_valence = p_val[class_names_valence.index("positive")] - \
                               p_val[class_names_valence.index("negative")]

                print(win_class, win_class_energy, win_class_valence,
                      soft_valence, soft_energy)

            all_data += mid_buf
            mid_buf = []

            h, w, _ = img.shape
            y_center, x_center = int(h / 2), int(w / 2)
            x = x_center + int((w / 2) * soft_valence)
            y = y_center - int((h / 2) * soft_energy)

            radius = 20
            emo_map_img_2 = emo_map_img.copy()
            color = numpy.median(emo_map[y - 2:y + 2, x - 2:x + 2],
                                 axis=0).mean(axis=0)
            emo_map_img_2 = cv2.circle(
                emo_map_img_2, (x, y), radius,
                (int(color[0]), int(color[1]), int(color[2])), -1)
            emo_map_img_2 = cv2.circle(emo_map_img_2, (x, y), radius,
                                       (255, 255, 255), 2)
            cv2.imshow('Emotion Color Map', emo_map_img_2)

            # set yeelight bulb colors
            if use_yeelight_bulbs:
                for b in bulbs:
                    if b:
                        # attention: color is in bgr so we need to invert:
                        b.set_rgb(int(color[2]), int(color[1]), int(color[0]))

            cv2.waitKey(10)
            count += 1
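A usage sketch for the bulb-driven variant above; the IP address is a placeholder, and the script also assumes the globals FORMAT, img, emo_map and emo_map_img defined elsewhere:

record_audio(block_size=1.0,
             devices=["192.168.1.40"],  # hypothetical yeelight bulb IP(s)
             use_yeelight_bulbs=False,  # set to True to actually drive the bulbs
             fs=8000)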
Example #17
def mtFileClassification(inputFile,
                         modelName,
                         modelType,
                         plotResults=False,
                         gtFile="",
                         return_for_user=False):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input WAV file
        - modelName:        name of the classification model
        - modelType:        svm or knn depending on the classifier type
        - plotResults:      True if results are to be plotted using matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the class ID of the i-th segment
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelType not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if modelType == "knn":
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = \
            aT.load_model_knn(modelName)
    else:
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = aT.load_model(modelName)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    for i in range(
            MidTermFeatures.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        curFV = (MidTermFeatures[:, i] -
                 MEAN) / STD  # normalize current feature vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType,
                                           curFV)  # classify vector
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
            flags[i] = flags[i + 1]
    (segs, classes) = flags2segs(
        flags, mtStep)  # convert fix-sized flags to segments and classes
    segs[-1, 1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT,
                                           mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep,
                                  not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        if return_for_user:
            return {'segments': segs, 'classes': classes, 'accuracy': acc}
        else:
            return (flagsInd, classNamesGT, acc, CM)
    else:
        if return_for_user:
            return {'segments': segs, 'classes': classes}
        else:
            return (flagsInd, classNames, acc, CM)
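A usage sketch for the return_for_user variant above, with hypothetical paths; the returned dictionary holds the segment boundaries (in seconds) and the class of each segment:

result = mtFileClassification("episode.wav",  # hypothetical input
                              "data/svmSM",   # hypothetical model
                              "svm",
                              plotResults=False,
                              return_for_user=True)
for (seg_start, seg_end), seg_class in zip(result['segments'], result['classes']):
    print("{0:.1f}-{1:.1f}s: {2}".format(seg_start, seg_end, seg_class))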