Esempio n. 1
0
def about():
    """Render the about page with a 3D spectrogram of a randomly picked test clip.

    A random index is drawn, the matching example's raw samples are fetched
    from the ``'test'`` partition, its log spectrogram is standardised and
    turned into a Plotly surface, and the resulting HTML ``div`` is passed to
    ``about.html`` as ``spectrogram_3d``.

    Returns:
        The rendered ``about.html`` template.
    """
    # Draw a random example; the upper bound 4176 is presumably the size of
    # the test partition -- TODO confirm against the dataset.
    random_index = np.random.randint(0, 4176)
    _, _, _, sample_rate, samples = make_predictions.vis_audio_features(
        index=random_index, partition='test')

    # Only the spectrogram itself is needed here; the two axis vectors
    # returned alongside it are discarded.
    _, _, spectrogram = make_predictions.log_spectrogram_feature(
        samples, sample_rate)

    # Standardise using statistics taken along axis 0.
    centre = np.mean(spectrogram, axis=0)
    spread = np.std(spectrogram, axis=0)
    standardised = (spectrogram - centre) / spread

    # Build the surface plot from the transposed, standardised spectrogram.
    surface = go.Surface(z=standardised.T, colorscale='Viridis')
    figure_layout = go.Layout(
        title='3D Spectrogram',
        autosize=True,
        width=700,
        height=700,
        margin=dict(l=50, r=50, b=50, t=50),
    )
    figure = go.Figure(data=[surface], layout=figure_layout)

    # include_plotlyjs=False: the div does not bundle plotly.js, so the page
    # that embeds it has to load the library itself.
    plot_div = plot(figure, output_type='div', include_plotlyjs=False)

    # Wrap in Markup so the template inserts the div as HTML instead of
    # escaping it, then render the page.
    return render_template('about.html', spectrogram_3d=Markup(plot_div))
Esempio n. 2
0
def visualization():
    """Render the visualization page for a user-selected dataset instance.

    When the form validates, the view:

    1. fetches the ground-truth and predicted transcriptions for the chosen
       model (``'model_10'``, otherwise model 8), partition and instance;
    2. builds the waveform, spectrogram, log-spectrogram and 3D spectrogram
       plots for that instance;
    3. sends the instance's audio to the Microsoft Speech API and extracts the
       fields of the first ``NBest`` hypothesis;
    4. derives the path handed to the template's audio player.

    When the form has not been submitted (or is invalid) every template
    variable other than the form itself is ``None``.

    Returns:
        The rendered ``visualization.html`` template.

    Raises:
        KeyError: if the Speech API JSON has no ``NBest`` entry or the first
            hypothesis lacks one of the expected fields.
    """
    # Initializing form for user input
    visualization_form = VisualizationForm()

    # Template variables; they stay None until a valid form is submitted.
    truth_transcription = None
    prediction_transcription = None
    raw_plot = None
    spectrogram_plot = None
    spectrogram_shape = None
    log_spectrogram_plot = None
    spectrogram_3d = None
    cortana_transcription = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    play_audio = None

    # Form for visualization engine
    if visualization_form.validate_on_submit():
        v_model_number = visualization_form.viz_model_number.data
        v_partition = visualization_form.viz_partition.data
        v_instance_number = visualization_form.viz_instance_number.data

        # Pick the network and its weights once; anything other than
        # 'model_10' falls back to model 8.
        if v_model_number == 'model_10':
            input_to_softmax = make_predictions.model_10
            model_path = './results/model_10.h5'
        else:
            input_to_softmax = make_predictions.model_8
            model_path = './results/model_8.h5'

        # Get ground truth and predicted transcriptions
        truth_transcription = make_predictions.get_ground_truth(
            index=v_instance_number,
            partition=v_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        prediction_transcription = make_predictions.get_prediction(
            index=v_instance_number,
            partition=v_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)

        # Get features for visualizations (text and audio path are not
        # needed by this view).
        _, vis_spectrogram_feature, _, sample_rate, samples = make_predictions.vis_audio_features(
            index=v_instance_number, partition=v_partition)
        # Plot the audio waveform
        raw_plot = make_predictions.plot_raw_audio(sample_rate, samples)
        # Plot the spectrogram of the audio file
        spectrogram_plot = make_predictions.plot_spectrogram_feature(
            vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the chosen audio file: ' + str(
            vis_spectrogram_feature.shape)

        # 2nd way to plot the spectrogram: standardise the log spectrogram
        # using statistics taken along axis 0.
        freqs, times, log_spectrogram = make_predictions.log_spectrogram_feature(
            samples, sample_rate)
        mean = np.mean(log_spectrogram, axis=0)
        std = np.std(log_spectrogram, axis=0)
        log_spectrogram = (log_spectrogram - mean) / std
        log_spectrogram_plot = make_predictions.plot_log_spectrogram_feature(
            freqs, times, log_spectrogram)

        # 3D surface plot of the selected instance's standardised log
        # spectrogram. include_plotlyjs=False means the div does not bundle
        # plotly.js, so the template has to load the library itself.
        figure = go.Figure(
            data=[go.Surface(z=log_spectrogram.T, colorscale='Viridis')],
            layout=go.Layout(title='3D Spectrogram',
                             autosize=True,
                             width=700,
                             height=700,
                             margin=dict(l=50, r=50, b=50, t=50)))
        # Markup keeps the template from HTML-escaping the generated div.
        spectrogram_3d = Markup(
            plot(figure, output_type='div', include_plotlyjs=False))

        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        filepath = make_predictions.azure_inference(index=v_instance_number,
                                                    partition=v_partition)
        # The context manager closes the audio file once the upload has
        # finished, including when the request raises.
        with open(filepath, 'rb') as audiofile:
            response = requests.post(
                'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
                headers=headers,
                params=params,
                data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        val = json.loads(response.text)
        # Only the first hypothesis in NBest is shown on the page.
        best = val["NBest"][0]
        confidence = best["Confidence"]
        lexical = best["Lexical"]
        itn = best["ITN"]
        maskeditn = best["MaskedITN"]
        display = best["Display"]

        # Serve the audio file for the audio player.
        # NOTE(review): this strips a developer-specific absolute prefix; if
        # filepath does not contain it, replace() leaves the path unchanged.
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")

    # Render the html page.
    return render_template('visualization.html',
                           visualization_form=visualization_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           raw_plot=raw_plot,
                           spectrogram_plot=spectrogram_plot,
                           log_spectrogram_plot=log_spectrogram_plot,
                           spectrogram_shape=spectrogram_shape,
                           spectrogram_3d=spectrogram_3d,
                           cortana_transcription=cortana_transcription,
                           confidence=confidence,
                           lexical=lexical,
                           itn=itn,
                           maskeditn=maskeditn,
                           display=display,
                           play_audio=play_audio)
Esempio n. 3
0
def index():
    """Render the inference page for an uploaded audio file.

    When the upload form validates, the view:

    1. saves the uploaded file to ``app/static/audio/tmp.wav``;
    2. sends it to the Microsoft Speech API, timing the request, and extracts
       the fields of the first ``NBest`` hypothesis;
    3. runs the local model (model 10) on the same file, timing the call;
    4. builds the waveform, spectrogram, log-spectrogram and 3D spectrogram
       plots for the upload;
    5. sends both transcriptions to the Microsoft Text Analytics API and reads
       back one sentiment score per transcription.

    When the form has not been submitted (or is invalid) every template
    variable other than the form itself is ``None``.

    Returns:
        The rendered ``index.html`` template.

    Raises:
        KeyError: if the Speech API JSON lacks ``NBest`` (or an expected
            hypothesis field), or the sentiment JSON lacks ``documents`` or a
            ``score``.
        IndexError: if the sentiment response contains fewer than two
            documents.
    """
    # Initializing form for user input
    audio_form = AudioForm(CombinedMultiDict((request.files, request.form)))

    # Template variables; they stay None until a valid form is submitted.
    filename = None
    prediction_transcription = None
    raw_plot = None
    spectrogram_plot = None
    spectrogram_shape = None
    log_spectrogram_plot = None
    spectrogram_3d = None
    jetson_time_to_predict = None
    cortana_time_to_predict = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    prediction_score = None
    cortana_score = None

    # Form for inference engine
    if audio_form.validate_on_submit():
        f = audio_form.audio_file.data
        # NOTE(review): every upload is written to the same fixed path, so
        # simultaneous requests would overwrite each other's audio.
        filename = os.path.join('app/static/audio/', 'tmp.wav')
        f.save(filename)

        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        c_start = time.time()
        # The context manager closes the audio file once the upload has
        # finished, including when the request raises.
        with open(filename, 'rb') as audiofile:
            response = requests.post(
                'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
                headers=headers,
                params=params,
                data=make_predictions.read_in_chunks(audiofile))
            c_end = time.time()
        cortana_time_to_predict = c_end - c_start
        val = json.loads(response.text)
        # Only the first hypothesis in NBest is shown on the page.
        best = val["NBest"][0]
        confidence = best["Confidence"]
        lexical = best["Lexical"]
        itn = best["ITN"]
        maskeditn = best["MaskedITN"]
        display = best["Display"]

        # Producing Hey, Jetson! predicted transcription
        s_start = time.time()
        prediction_transcription = make_predictions.run_inference(
            audio_path=filename,
            input_to_softmax=make_predictions.model_10,
            model_path='./results/model_10.h5')
        s_end = time.time()
        jetson_time_to_predict = s_end - s_start

        vis_spectrogram_feature, sample_rate, samples = make_predictions.inference_vis_audio_features(
            index=filename)
        # Plot the audio waveform
        raw_plot = make_predictions.plot_raw_audio(sample_rate, samples)
        # Plot the spectrogram of the audio file
        spectrogram_plot = make_predictions.plot_spectrogram_feature(
            vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the uploaded audio file: ' + str(
            vis_spectrogram_feature.shape)

        # 2nd way to plot the spectrogram: standardise the log spectrogram
        # using statistics taken along axis 0.
        freqs, times, log_spectrogram = make_predictions.log_spectrogram_feature(
            samples, sample_rate)
        mean = np.mean(log_spectrogram, axis=0)
        std = np.std(log_spectrogram, axis=0)
        log_spectrogram = (log_spectrogram - mean) / std
        log_spectrogram_plot = make_predictions.plot_log_spectrogram_feature(
            freqs, times, log_spectrogram)

        # 3D surface plot of the uploaded file's standardised log
        # spectrogram. include_plotlyjs=False means the div does not bundle
        # plotly.js, so the template has to load the library itself.
        figure = go.Figure(
            data=[go.Surface(z=log_spectrogram.T, colorscale='Viridis')],
            layout=go.Layout(title='3D Spectrogram',
                             autosize=True,
                             width=700,
                             height=700,
                             margin=dict(l=50, r=50, b=50, t=50)))
        # Markup keeps the template from HTML-escaping the generated div.
        spectrogram_3d = Markup(
            plot(figure, output_type='div', include_plotlyjs=False))

        # Connecting to Microsoft Text Analytics API for sentiment analysis.
        # Document 0 is the local model's transcription, document 1 is the
        # Speech API's lexical transcription; the scores below are read back
        # by the same positions.
        text_documents = {
            'documents': [{
                'id': 'Predicted Transcription',
                'language': 'en',
                'text': prediction_transcription
            }, {
                'id': 'Cortana Transcription',
                'language': 'en',
                'text': lexical
            }]
        }
        sentiment_response = requests.post(sentiment_api_url,
                                           headers=text_headers,
                                           json=text_documents)
        documents = sentiment_response.json()["documents"]
        prediction_score = documents[0]["score"]
        cortana_score = documents[1]["score"]

    # Render the html page.
    return render_template('index.html',
                           audio_form=audio_form,
                           filename=filename,
                           prediction_transcription=prediction_transcription,
                           raw_plot=raw_plot,
                           spectrogram_plot=spectrogram_plot,
                           log_spectrogram_plot=log_spectrogram_plot,
                           spectrogram_shape=spectrogram_shape,
                           spectrogram_3d=spectrogram_3d,
                           jetson_time_to_predict=jetson_time_to_predict,
                           cortana_time_to_predict=cortana_time_to_predict,
                           confidence=confidence,
                           lexical=lexical,
                           itn=itn,
                           maskeditn=maskeditn,
                           display=display,
                           prediction_score=prediction_score,
                           cortana_score=cortana_score)