Code Example #1
File: main.py  Project: wuweijia1994/SmartSpeaker
def audioRecorderCallback(fname):
    global command
    global cnt
    cnt.send_commands('3')
    print("converting audio to text")
    voice_cmd = transcribe.transcribe_file(fname).upper()
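    # Match the transcript against the known command set; the first match wins.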
    for cmd in CMD:
        if voice_cmd.find(cmd.name)!=-1:
            command = cmd
            break

    print(voice_cmd)

    # snowboydecoder.play_audio_file(fname)
    # print(fname)
    # r = sr.Recognizer()
    # with sr.AudioFile(fname) as source:
    #     audio = r.record(source)  # read the entire audio file
    # # recognize speech using Google Speech Recognition
    # try:
    #     # for testing purposes, we're just using the default API key
    #     # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
    #     # instead of `r.recognize_google(audio)`
    #     print(r.recognize_google(audio))
    # except sr.UnknownValueError:
    #     print "Google Speech Recognition could not understand audio"
    # except sr.RequestError as e:
    #     print "Could not request results from Google Speech Recognition service; {0}".format(e)
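    # Resume playback unless the recognized command is STOP or NEXT.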
    if mixer.music.get_busy() and command != CMD.STOP and command != CMD.NEXT:
        mixer.music.unpause()

    os.remove(fname)
Code Example #2
    def post(self):
        print('received {}'.format(request.json))
        url = request.json["audio"]
        if os.path.exists("audio.wav"):
            os.remove("audio.wav")
        print('beginning download of {}'.format(url))
        wget.download(
            url,
            "audio.wav")  # please remember to change this with a subprocess
        print('downloaded successfully')
        print('cleaning')
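        # Run the external silence remover; its stdout is read later via p.read().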
        p = os.popen(
            'python3 silenceremover.py 3 audio.wav')  # creates cleaned.wav
        time.sleep(8)

        inference_api.main(
            False)  # TODO: replace this blocking call with a subprocess
        print('done with the NN')
        # print('converting to mono')
        # try:
        #     stetomono.to_mono()
        # except:
        #     return('can\'t convert to mono for some reason.')

        print(p.read())
        print('transcribing')
        # try:
        transcription = transcribe.transcribe_file('cleaned.wav')
        # except:
        #     return('Please choose a file with a sample rate of an 8000 multiplication (16000,32000), reverting to last cached audio')

        return ("Transcription : " + transcription)
Code Example #3
def getWeather(q):
    weather = Weather()
    malespeaker.speak("Please tell me the city you are interested in")
    x = "I could not get that"
    while x == "I could not get that":
        x = transcribe.transcribe_file()

    print("city: " + x)
    # x=input()
    location = weather.lookup_by_location(x)
    condition = location.condition()
    #print(condition.text())

    # Get weather forecasts for the upcoming days.
    s = ""
    forecasts = location.forecast()
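    # The counter below limits the spoken summary to four forecast entries.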
    i = 2

    for forecast in forecasts:
        s += forecast.text()
        s += "\n" + forecast.date()
        s += ("\n Max " + forecast.high() + " (Degree Celsius)" +
              " Min " + forecast.low() + " (Degree Celsius)")
        s += "\n\n"
        i += 1
        if i > 5:
            break
    print(s)
    malespeaker.speak(s)
Code Example #4
File: bot.py  Project: alicebalayan/Team-3
    async def stop_and_process(self, message):

        global servers
        guild = message.guild
        debug_channel = self.get_channel(805477547755175977)

        url = "http://localhost:3004/" + servers[guild][
            "recordingID"] + "?format=flac&container=aupzip"

        r = requests.get(url, allow_redirects=True)
        with open('recording.zip', 'wb') as f:
            f.write(r.content)
        with zipfile.ZipFile('recording.zip', 'r') as zip_ref:
            zip_ref.extractall("recording")
        os.remove("recording.zip")
        path = "recording/" + servers[guild]["recordingID"] + "_data/"
        listOfFiles = glob.glob(path + '*.ogg')
        print(listOfFiles)
        for rec in listOfFiles:
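            # Parse the speaker's username from the file name (between the first '-' and the next '_').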
            username = rec[rec.index("-") + 1:]
            username = username[:username.index("_")]
            # print(username)
            # print(transcribe_file(rec))
            result = transcribe_file(rec)
            paragraph = username + " said "
            if len(result) == 0:
                continue
            for sentence in result:
                paragraph += sentence + " "
            # paragraph +=" "+str(len(result))
            process_content(paragraph)
            # TODO remove this later just here for debug
            await debug_channel.send(paragraph)
Code Example #5
File: mic_server.py  Project: NeoFlameS/Raspi_Git
    def handle(self):
        print('[%d]. [%s] connected' % (num, self.client_address[0]))

        filename = self.request.recv(1024)  # receive the file name from the client
        filename = filename.decode()  # decode the raw bytes into a regular string

        with open(filename, 'wb') as f:
            try:
                # receive the file contents in 1 KB chunks and write them to disk
                data = self.request.recv(1024)
                while data:
                    f.write(data)
                    data = self.request.recv(1024)
            except Exception as e:
                print(e)

        print('File transfer finished.')
        transcribe.transcribe_file(filename)
Code Example #6
def worddown(q):
    malespeaker.speak("Please tell me which document to edit")
    tobenoted = transcribe.transcribe_file()
    os.system('start "" winword.exe C:\\Users\\"Aditya Goel"\\Documents\\' +
              tobenoted + '.docx')
    doc = Document('C:\\Users\\Aditya Goel\\Documents\\' + tobenoted + '.docx')
    wholedoc = ""
    for para in doc.paragraphs:
        wholedoc = para.text
        malespeaker.speak(wholedoc)
    sleep(2)
Code Example #7
def notedown(q):
	malespeaker.speak("Please tell me the note you want to write")
	tobenoted = transcribe.transcribe_file()
	subprocess.Popen(r"C:\Windows\System32\notepad.exe", shell=True)
	time.sleep(2)
	pyautogui.click(250, 250)
	pyautogui.typewrite(tobenoted, 0.02)
	pyautogui.hotkey('ctrl', 's')
	pyautogui.typewrite('note1.txt')
	time.sleep(1)
	pyautogui.press('enter')
	pyautogui.press('left')
	pyautogui.press('enter')
	malespeaker.speak("Your note has been saved successfully to note1.txt in documents")
Code Example #8
def response(bot: telegram.bot.Bot):
    """Echo the message the user sent."""
    global update_id
    # Request updates after the last update_id
    for update in bot.get_updates(offset=update_id, timeout=10):
        update_id = update.update_id + 1
        if update.message and update.message.voice:  # your bot can receive updates without messages
            # Reply to the message
            voice_message = update.message.voice
            file_id = voice_message.file_id
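            # Use the update ID to build a unique temp filename so concurrent voice messages don't collide.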
            unique_file_name = str(update_id) + "_voice.ogg"
            bot.get_file(file_id).download(custom_path=unique_file_name)
            transcript = transcribe_file(unique_file_name)
            update.message.reply_text(transcript)
            os.remove(unique_file_name)
Code Example #9
File: ms_begin.py  Project: AnshulMalik/MS_View
def main():
    """
       Greets the user.
       Checks when the system was last updated.
       Checks if there are reminders.
       Waits for user input.
       If input is "bye" (or similar), quits.
       Else passes the user input to the function msview() stored in msview.py
    """
    # greetings_path = os.path.dirname(os.getcwd()) + "/Text_Files/greetings.txt"
    # f_greetings = open(greetings_path, "r")
    # greetings_list = f_greetings.read().strip().split("\n")
    # f_greetings.close()
    # random_greeting = random.randrange(0, len(greetings_list))
    malespeaker.speak("Hello Sir!")

    # bye_path = os.path.dirname(os.getcwd()) + "/Text_Files/bye.txt"
    # f_bye = open(bye_path, "r")
    # bye_list = f_bye.read().strip().split("\n")
    # f_bye.close()
    # bye_list2 = [''.join(bye_list[i])+" Shrimp" for i in range(0, len(bye_list))]
    # bye_list3 = [''.join(bye_list[i])+" Mantis" for i in range(0, len(bye_list))]
    # bye_list4 = [''.join(bye_list[i]) + " MS" for i in range(0, len(bye_list))]
    while 1:
        malespeaker.speak("How can I help You?")
        voice_query = transcribe.transcribe_file()
        # voice_query = input('\033[1m'+'Username: '******'\033[0m')  # modify
        # print (voice_query)
        if voice_query:
            voice_query = voice_query.lower()
            # if voice_query in bye_list or \
            # voice_query in bye_list2 or \
            # voice_query in bye_list3 or \
            # voice_query in bye_list4:
            if "bye" in voice_query or "nothanks" in voice_query or "no thanks" in voice_query:
                time_now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                if int(time_now[11:13]) > 20 or int(time_now[11:13]) < 4:
                    malespeaker.speak("Goodnight.")
                else:
                    malespeaker.speak('See you later then! Have a good day!')
                return
            # Look up the handler registered for this query and invoke it.
            handler = queryHandlers.get(voice_query)
            if handler:
                handler(voice_query)
Code Example #10
        for part in subs:
            searchSub = re.search(r'\b' + data[i] + r'\b', part.text,
                                  re.IGNORECASE)
            if searchSub:
                filename = fnames[k]
                flac_filename = filename[0:38] + '.flac'
                newAudio = AudioSegment.from_file(audio_path + flac_filename)
                # Finds the part of the subs in which the word is being said and exports it as a new audio file for Google Speech Api
                t1 = part.start.minutes * 60 * 1000 + part.start.seconds * 1000 + part.start.milliseconds
                t2 = part.end.minutes * 60 * 1000 + part.end.seconds * 1000 + 1000 + part.end.milliseconds
                newAudio = newAudio[t1:t2]
                newAudio.export(temp_path + 'temporary' + str(i) + '.flac',
                                format='flac',
                                parameters=["-ac", "1"])

                word_info = transcribe_file(
                    temp_path + 'temporary' + str(i) + '.flac', data, i)
                if (word_info is not None):
                    print("\n")
                    print(word_info.word)
                    print(data[i])
                    if (word_info.word.lower() == data[i].lower()):
                        j = True
                        firstVisitedFlag = True
                        start_time = word_info.start_time
                        end_time = word_info.end_time
                        start_seconds_frac = start_time.nanos * 1e-9
                        end_seconds_frac = end_time.nanos * 1e-9
                        t1 = start_time.seconds * 1000 + start_seconds_frac * 1000
                        t2 = end_time.seconds * 1000 + end_seconds_frac * 1000
                        keywordAudio = AudioSegment.from_file(temp_path +
                                                              'temporary' +
Code Example #11
def test_transcribe_file(capsys):
    transcribe.transcribe_file(os.path.join(RESOURCES, "audio.raw"))
    out, err = capsys.readouterr()

    assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I)
Code Example #12
File: run_rnnt.py  Project: luweishuang/RNNT_SR
def main(_):

    logging.info('Running with parameters:')
    logging.info(json.dumps(FLAGS.flag_values_dict(), indent=4))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'config.json')):

        expect_partial = False
        if FLAGS.mode in ['transcribe-file', 'realtime']:
            expect_partial = True

        model = load_model(FLAGS.model_dir,
            checkpoint=FLAGS.checkpoint, expect_partial=expect_partial)

    else:

        if FLAGS.mode in ['eval', 'transcribe-file', 'realtime']:
            raise Exception('Model not found at path: {}'.format(
                FLAGS.model_dir))

        logging.info('Initializing model from scratch.')

        os.makedirs(FLAGS.model_dir, exist_ok=True)
        model_config_filepath = os.path.join(FLAGS.model_dir, 'config.json')

        vocab = vocabulary.init_vocab()
        vocabulary.save_vocab(vocab, os.path.join(FLAGS.model_dir, 'vocab'))

        model = Transducer(vocab=vocab,
                           encoder_layers=FLAGS.encoder_layers,
                           encoder_size=FLAGS.encoder_size,
                           pred_net_layers=FLAGS.pred_net_layers,
                           pred_net_size=FLAGS.pred_net_size,
                           joint_net_size=FLAGS.joint_net_size,
                           softmax_size=FLAGS.softmax_size)

        model.save_config(model_config_filepath)

        logging.info('Initialized model from scratch.')

    distribution_strategy = None

    if FLAGS.tpu is not None:

        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        distribution_strategy = tf.distribute.experimental.TPUStrategy(
            tpu_cluster_resolver=tpu_cluster_resolver)

    if FLAGS.mode == 'export':
        
        saved_model_dir = os.path.join(FLAGS.model_dir, 'saved_model')
        os.makedirs(saved_model_dir, exist_ok=True)
        
        all_versions = [int(ver) for ver in os.listdir(saved_model_dir)]

        if len(all_versions) > 0:
            version = max(all_versions) + 1
        else:
            version = 1

        export_path = os.path.join(saved_model_dir, str(version))
        os.makedirs(export_path)

        tf.saved_model.save(model, export_path, signatures={
            'serving_default': model.predict
        })

    elif FLAGS.mode == 'transcribe-file':

        transcription = transcribe_file(model, FLAGS.input)

        print('Input file: {}'.format(FLAGS.input))
        print('Transcription: {}'.format(transcription))

    elif FLAGS.mode == 'realtime':

        audio_buf = []
        last_result = None

        def stream_callback(in_data, frame_count, time_info, status):
            audio_buf.append(in_data)
            return None, pyaudio.paContinue

        def audio_gen():
            while True:
                if len(audio_buf) > 0:
                    # pop the oldest chunk so each buffer is only yielded once
                    audio_data = audio_buf.pop(0)
                    audio_arr = np.frombuffer(audio_data, dtype=np.float32)
                    yield audio_arr

        FORMAT = pyaudio.paFloat32
        CHANNELS = 1
        RATE = 16000
        CHUNK = 2048
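        # i.e. 16 kHz mono float32 capture in 2048-frame chunks (presumably what the model expects)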

        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                            stream_callback=stream_callback)
        
        stream.start_stream()

        outputs = transcribe_stream(model, audio_gen(), RATE)

        print('Transcribing live audio (press CTRL+C to stop)...')

        for (output, is_final) in outputs:
            if output != last_result and output != '' and not is_final:
                print('Partial Result: {}'.format(output))
                last_result = output
            if is_final:
                print('# Final Result: {}'.format(output))
                last_result = None

    else:

        if FLAGS.dataset_name == 'common-voice':
            data_utils = utils.data.common_voice

        train_dataset, dev_dataset = data_utils.create_datasets(FLAGS.dataset_path,
            max_data=FLAGS.max_data)

        if dev_dataset is None:
            dev_dataset = train_dataset.take(FLAGS.eval_size)
            train_dataset = train_dataset.skip(FLAGS.eval_size)

        if FLAGS.mode == 'eval':

            logging.info('Begin evaluation...')

            loss, acc = do_eval(model, dev_dataset,
                                batch_size=FLAGS.batch_size,
                                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                                distribution_strategy=distribution_strategy)

            logging.info('Evaluation complete: Loss {} Accuracy {}'.format(
                loss, acc))

        else:

            optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

            checkpoints_path = os.path.join(FLAGS.model_dir, 'checkpoints')
            os.makedirs(checkpoints_path, exist_ok=True)

            do_train(model, train_dataset, optimizer,
                     FLAGS.epochs, FLAGS.batch_size,
                     eval_dataset=dev_dataset,
                     steps_per_checkpoint=FLAGS.steps_per_checkpoint,
                     checkpoint_path=checkpoints_path,
                     steps_per_log=FLAGS.steps_per_log,
                     tb_log_dir=FLAGS.tb_log_dir,
                     keep_top_n=FLAGS.keep_top,
                     shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                     distribution_strategy=distribution_strategy)
Code Example #13
def test_transcribe_file(capsys):
    transcribe.transcribe_file(os.path.join(RESOURCES, 'audio.raw'))
    out, err = capsys.readouterr()

    assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)
Code Example #14
def test_transcribe_file(resource, capsys):
    transcribe.transcribe_file(resource('audio.raw'))
    out, err = capsys.readouterr()

    assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)
Code Example #15
def main(_):

    logging.info('Running with parameters:')
    logging.info(json.dumps(FLAGS.flag_values_dict(), indent=4))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'config.json')):

        expect_partial = False
        if FLAGS.mode in ['transcribe-file', 'realtime', 'export']:
            expect_partial = True

        model = load_model(FLAGS.model_dir,
            checkpoint=FLAGS.checkpoint, expect_partial=expect_partial)

    else:

        if FLAGS.mode in ['eval', 'transcribe-file', 'realtime']:
            raise Exception('Model not found at path: {}'.format(
                FLAGS.model_dir))

        logging.info('Initializing model from scratch.')

        os.makedirs(FLAGS.model_dir, exist_ok=True)
        model_config_filepath = os.path.join(FLAGS.model_dir, 'config.json')

        vocab = vocabulary.init_vocab()
        vocabulary.save_vocab(vocab, os.path.join(FLAGS.model_dir, 'vocab'))

        model = Transducer(vocab=vocab,
                           encoder_layers=FLAGS.encoder_layers,
                           encoder_size=FLAGS.encoder_size,
                           pred_net_layers=FLAGS.pred_net_layers,
                           joint_net_size=FLAGS.joint_net_size,
                           softmax_size=FLAGS.softmax_size)

        model.save_config(model_config_filepath)

        logging.info('Initialized model from scratch.')

    distribution_strategy = None

    if FLAGS.tpu is not None:

        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        distribution_strategy = tf.distribute.experimental.TPUStrategy(
            tpu_cluster_resolver=tpu_cluster_resolver)

    if FLAGS.mode == 'export':
        
        # saved_model_dir = os.path.join(FLAGS.model_dir, 'saved_model')
        # os.makedirs(saved_model_dir, exist_ok=True)
        
        # all_versions = [int(ver) for ver in os.listdir(saved_model_dir)]

        # if len(all_versions) > 0:
        #     version = max(all_versions) + 1
        # else:
        #     version = 1

        # export_path = os.path.join(saved_model_dir, str(version))
        # os.makedirs(export_path)

        # tf.saved_model.save(model, export_path, signatures={
        #     'serving_default': model.predict
        # })

        # print(model.predict(tf.zeros((1, 1024)), tf.constant([16000]), tf.constant(['hell']), tf.zeros((1, 2, 1, 2048))))

        tflite_dir = os.path.join(FLAGS.model_dir, 'lite')
        os.makedirs(tflite_dir, exist_ok=True)

        concrete_func = model.predict.get_concrete_function(
            audio=tf.TensorSpec([1, 1024], dtype=tf.float32),
            sr=tf.TensorSpec([1], dtype=tf.int32),
            pred_inp=tf.TensorSpec([1], dtype=tf.string),
            enc_state=tf.TensorSpec([1, 2, 1, model.encoder_size], dtype=tf.float32))

        converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                               tf.lite.OpsSet.SELECT_TF_OPS]
        converter.experimental_new_converter = True
        converter.experimental_new_quantizer = True
        converter.allow_custom_ops = True

        # def representative_dataset_gen():
        #     dataset, _ = load_datasets()
        #     for i in range(10):
        #         yield [next(dataset)]

        # converter.optimizations = [tf.lite.Optimize.DEFAULT]
        # converter.representative_dataset = representative_dataset_gen
        # converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        # converter.inference_input_type = tf.uint8
        # converter.inference_output_type = tf.uint8

        tflite_quant_model = converter.convert()
        
        with open(os.path.join(tflite_dir, 'model.tflite'), 'wb') as f:
            f.write(tflite_quant_model)

        print('Exported model to TFLite.')

    elif FLAGS.mode == 'transcribe-file':

        transcription = transcribe_file(model, FLAGS.input)

        print('Input file: {}'.format(FLAGS.input))
        print('Transcription: {}'.format(transcription))

    elif FLAGS.mode == 'realtime':

        import pyaudio

        audio_buf = []
        last_result = None

        def stream_callback(in_data, frame_count, time_info, status):
            audio_buf.append(in_data)
            return None, pyaudio.paContinue

        def audio_gen():
            while True:
                if len(audio_buf) > 0:
                    # pop the oldest chunk so each buffer is only yielded once
                    audio_data = audio_buf.pop(0)
                    audio_arr = np.frombuffer(audio_data, dtype=np.float32)
                    yield audio_arr

        FORMAT = pyaudio.paFloat32
        CHANNELS = 1
        RATE = 16000
        CHUNK = 2048

        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                            stream_callback=stream_callback)
        
        stream.start_stream()

        outputs = transcribe_stream(model, audio_gen(), RATE)

        print('Transcribing live audio (press CTRL+C to stop)...')

        for (output, is_final) in outputs:
            if output != last_result and output != '' and not is_final:
                print('Partial Result: {}'.format(output))
                last_result = output
            if is_final:
                print('# Final Result: {}'.format(output))
                last_result = None

    else:

        train_dataset, dev_dataset = load_datasets()

        if dev_dataset is None:
            dev_dataset = train_dataset.take(FLAGS.eval_size)
            train_dataset = train_dataset.skip(FLAGS.eval_size)

        if FLAGS.eval_size:
            dev_dataset = dev_dataset.take(FLAGS.eval_size)

        if FLAGS.mode == 'eval':

            logging.info('Begin evaluation...')

            loss, acc = do_eval(model, dev_dataset,
                                batch_size=FLAGS.batch_size,
                                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                                distribution_strategy=distribution_strategy)

            logging.info('Evaluation complete: Loss {} Accuracy {}'.format(
                loss, acc))

        else:

            optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

            checkpoints_path = os.path.join(FLAGS.model_dir, 'checkpoints')
            os.makedirs(checkpoints_path, exist_ok=True)

            do_train(model, train_dataset, optimizer,
                     FLAGS.epochs, FLAGS.batch_size,
                     eval_dataset=dev_dataset,
                     steps_per_checkpoint=FLAGS.steps_per_checkpoint,
                     checkpoint_path=checkpoints_path,
                     steps_per_log=FLAGS.steps_per_log,
                     tb_log_dir=FLAGS.tb_log_dir,
                     keep_top_n=FLAGS.keep_top,
                     shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                     distribution_strategy=distribution_strategy)
Code Example #16
File: Main.py  Project: aleks24000/RPiRecorder
        passed = True
        command_line = ("arecord --device=hw:" + str(AUDIO_CONFIG['record_device']) +
                        ",0 --format S16_LE --rate " + str(AUDIO_CONFIG['rate']) +
                        " -c" + str(AUDIO_CONFIG['channel']) + " temp_audio.wav &")
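        # Start arecord without blocking; the PID is kept so the recording can be stopped on button release.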
        args = shlex.split(command_line)
        proc = subprocess.Popen(args)
        print("PID:" + str(proc.pid))

    if GPIO.input(3) != GPIO.HIGH and passed:
        print("Button released")
        time.sleep(0.5)
        passed = False
        command_line = "kill " + str(proc.pid)
        os.system(command_line)
        if AUDIO_CONFIG['channel'] == 2:
            command_line = "sox temp_audio.wav -c 1 temp_audio_mono.wav"
            os.system(command_line)
        else:
            command_line = "cp temp_audio.wav temp_audio_mono.wav"
            os.system(command_line)
        command_line = "flac -f temp_audio_mono.wav"
        os.system(command_line)

        transcribe_file("temp_audio_mono.flac")
        # cmd = "aplay -D hw:" + str(AUDIO_CONFIG['play_device']) + ",0 output.wav"
        cmd = "aplay output.wav"
        print("Launching cmd : " + cmd)
        os.system(cmd)
Code Example #17
def transcribe_input(audio_path, threadcount):
    file_path = convertToWav(audio_path)  # we can only process WAV files,
    # so convert other codecs and sample rates to 16 kHz WAV first
    filepath, ds = media_processor(file_path)
    return transcribe_file(filepath, ds, threadcount)