def compare_pronunciations(word, show_name, file_name):
    """
    Plot the frequency of each pronunciation of `word` in the corpus.

    Loads a cached pronunciation dict for the show if one exists on disk,
    otherwise compiles it from the audiosearch episodes, then renders the
    pronunciation-frequency plot.

    Parameters
    ----------
    word : str
        The word whose pronunciations should be compared (lowercased here).
    show_name : str
        The podcast name as it appears in the audiosearch API.
    file_name : str
        Nickname for the podcast, used to locate the cached pickle.

    Returns
    -------
    dict
        The pronunciation dictionary used for plotting.
    """
    target = word.lower()

    client = init_as_client()
    episode_ids, _show_id = find_episodes(show_name, client)

    cache_path = ('./alignment_data/pronunciations/'
                  '{}_prnc_dict.pickle'.format(file_name))

    # Reuse the on-disk cache when present; otherwise build it fresh.
    if not os.path.lexists(cache_path):
        prnc_dict = compile_prnc_dict(episode_ids, file_name)
    else:
        with open(cache_path, 'rb') as handle:
            prnc_dict = pickle.load(handle)

    plot_pronunciations(prnc_dict, target, show_name, file_name)
    return prnc_dict
def align_show(show_name, n_segs, file_name):
    """
    Finds all the episode ids associated with show_name in the audiosearch
    db, find the corresponding transcript ids for each episode, downloads
    the episode audio from soundcloud, compiles the episode transcript from
    the audiosearch db, segments the audio and transcripts into bite-sized
    segments, runs each segment through p2fa to get phoneme-level
    alignments, and writes both regular and phoneme-level bookworm files.

    Parameters
    ----------
    show_name : str
        The name of the particular podcast as it appears in the
        audiosearch API
    n_segs : int
        The number of chunks to split a transcript into during alignment
        with p2fa (more chunks ~> faster alignment)
    file_name : str
        A nickname for the podcast. This is used when saving the audio
        files, naming the bookworm folders, writing transcript jsons, etc.
    """
    make_alignments_directory()

    as_client = init_as_client()
    as_ep_ids, as_show_id = find_episodes(show_name, as_client)
    trans_dict = find_episode_transcript_ids(as_ep_ids)

    compile_audio_and_transcripts(trans_dict, n_segs, as_client, file_name)
    as_ep_ids, problem_episodes = align_transcripts(as_ep_ids)
    compile_alignments_bookworm(as_ep_ids, file_name)

    # BUG FIX: the original passed `pdict_fp`, a name never defined in this
    # function (it would raise NameError). compile_prnc_dict is called with
    # (episode ids, file_name) elsewhere in this file, so pass file_name.
    prnc_dict = compile_prnc_dict(as_ep_ids, file_name)

    pause_dict = compile_pause_dict(file_name, prnc_dict)
    plot_pause_words(show_name, file_name, pause_dict)
def compile_alignments_bookworm(as_ep_ids, file_name):
    """
    Reads the phoneme alignments for each episode, matches them to the
    corresponding line in transcript, gets the appropriate episode
    metadata, and constructs the corresponding catalog.json and input.txt
    files for both a phoneme bookworm and a regular bookworm.

    Parameters
    ----------
    as_ep_ids : list
        The list of audiosearch episode ids associated with a particular
        podcast
    file_name : str
        A nickname for the podcast. This is used when saving the audio
        files, naming the bookworm folders, writing transcript jsons, etc.
    """
    counter = 0  # running line counter threaded through write_bw_catalog

    # db = db_connect()
    db = Postgres_Connect().connection
    as_client = init_as_client()

    make_bw_directories(file_name)

    # NOTE(review): ts_csv is never used below — presumably load_timesteps
    # has needed side effects; confirm before removing.
    ts_csv = load_timesteps(file_name)
    trans_dict = find_episode_transcript_ids(as_ep_ids)

    for (ep_num, trans_id) in trans_dict.items():
        phoneme_file = \
            './alignment_data/alignments_json/{}_seg*.json'.format(ep_num)

        # Hoisted: the original called glob.glob(phoneme_file) twice.
        seg_files = glob.glob(phoneme_file)
        if len(seg_files) == 0:
            print('No phoneme alignments for episode'
                  ' {}. Skipping.'.format(ep_num))
            continue

        transcript = compile_episode_transcript(trans_id, db)
        meta = collect_episode_metadata(db, ep_num, as_client)

        if len(transcript) == 0:
            # BUG FIX: this literal was broken by a raw newline in the
            # original ('in AS database. \nSkipping.'), a syntax error.
            print('Unable to find transcript ID {} '
                  'in AS database. Skipping.'.format(trans_id))
            continue

        phoneme_transcript = []
        for fp in np.sort(seg_files):
            phoneme_transcript = phoneme_transcript + \
                compile_phoneme_transcript(fp, transcript)

        print('Writing BW entry for {}'.format(phoneme_file))

        # phoneme_transcript[idx] =
        # [phoneme sentence, transcript sentence, transcript index]
        phoneme_transcript = np.asarray(phoneme_transcript)
        counter = write_bw_catalog(transcript, phoneme_transcript,
                                   counter, ep_num, meta, file_name)

    write_field_descriptions(file_name)

    # Collect the generated bookworm folders under a single directory.
    shutil.move('./metadata', './' + file_name)
    shutil.move('./texts', './' + file_name)
    os.mkdir('./' + file_name + '_bookworm')
    shutil.move('./' + file_name, './' + file_name + '_bookworm')
    shutil.move('./' + file_name + '_phonemes', './' + file_name + '_bookworm')