# SpeechRecognition_IBM_Plus.py
# Forked from Xinghua-TAO/AutoViDevSpeech.
from watson_developer_cloud import SpeechToTextV1
import json
import pandas as pd
def recognize_speech_ibm_plus(yourkey, url_address, audio_input, csv_output, *,
                              model="en-US_NarrowbandModel",
                              content_type='audio/wav',
                              json_output=None):
    """Transcribe an audio file with IBM Watson Speech to Text and save word timings.

    The audio is sent to the service with word timestamps enabled. The
    ``[word, start, end]`` entries of the first alternative of every result
    are collected into one table, the two time columns are multiplied by
    1000 (the service reports seconds, so the table holds milliseconds), and
    the columns are ordered ``onset, offset, word`` to fit Datavyu. The table
    is printed and written to ``csv_output``.

    Args:
        yourkey: IAM API key for the Speech to Text service instance.
        url_address: URL of the Speech to Text service endpoint.
        audio_input: Path of the audio file to transcribe.
        csv_output: Path of the CSV file to write.
        model: Name of the recognition model. The service offers several
            models; check the IBM documentation and pick the most suitable
            one for your recording.
        content_type: MIME type of the audio file (wav is the recommended
            format).
        json_output: Optional path. When given, the raw service response is
            also saved there as JSON. When ``None`` (the default) nothing
            besides the CSV is written.

    Raises:
        OSError: If ``audio_input`` cannot be opened or an output file
            cannot be written.
    """
    # set your authorization
    speech_to_text = SpeechToTextV1(
        iam_apikey=yourkey,
        url=url_address
    )
    # use IBM API to recognize your audio
    with open(audio_input, 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            model=model,
            content_type=content_type,
            timestamps=True,  # request the onset and offset of every word
        ).get_result()

    # The response is already a parsed dictionary, so it is used directly.
    # It used to be written to (and re-read from) a fixed path on the
    # original author's machine, which fails everywhere else; saving the raw
    # JSON is now opt-in through ``json_output``.
    if json_output is not None:
        with open(json_output, "w", encoding="utf-8") as write_file:
            json.dump(speech_recognition_results, write_file)

    # make sure you remember the audio you opened and double check
    print(audio_input)

    table = _timestamps_to_table(speech_recognition_results)
    print(table)
    # store the table in a csv file
    table.to_csv(csv_output)


def _timestamps_to_table(recognition_results):
    """Return an onset/offset/word DataFrame built from a recognize() response."""
    # pick the timestamp lists of every result and combine them in one list;
    # a response without a 'results' key yields an empty table
    words = []
    for result in recognition_results.get('results', []):
        words.extend(result['alternatives'][0]['timestamps'])
    # change the list we get to a table and add a head to it
    table = pd.DataFrame(columns=['word', 'onset', 'offset'], data=words)
    table['onset'] = table['onset'] * 1000
    table['offset'] = table['offset'] * 1000
    # to fit this table with datavyu, we need to change the order of the columns
    return table[['onset', 'offset', 'word']]
# Example run. Guarded so that importing this module does not send a request
# to the service or write files; the call only happens when the file is
# executed as a script.
if __name__ == "__main__":
    # NOTE(review): an API key is committed here in plain text. It should be
    # rotated and loaded from the environment or a config file kept out of
    # version control.
    recognize_speech_ibm_plus(
        '4b9ZXrpMUL6jo5Q9lw1htmo_Y4xdiku9tydRKVY8Q9Pv',
        'https://stream.watsonplatform.net/speech-to-text/api',
        "C:/Users/taotao/Desktop/research/test/HHcut.wav",
        "C:/Users/taotao/Desktop/research/test/HHcut.csv",
    )