# Load the processed subtitle data for one (show, episode) from its JSON cache
# files, rebuilding the caches from the raw .srt file when they cannot be read.
#
# NOTE(review): `show` and `episode_name` are not bound in the visible code, so
# the statements up to the final json.dump presumably run inside an enclosing
# loop whose header is out of view -- re-indent them to match when applying.
sub_stamps_path = DIR_SUBS[show]+'/'+episode_name+'_proc_sub_st.json'
sub_text_path = DIR_SUBS[show]+'/'+episode_name+'_proc_sub_tx.json'
sub_unttext_path = DIR_SUBS[show]+'/'+episode_name+'_proc_sub_untx.json'
temp_path = DIR_SUBS[show]+'/'+'temp.srt'
try:
    # All three cache files must open; a missing one raises IOError before
    # anything is loaded.
    with open(sub_stamps_path, 'r') as fp1, \
            open(sub_text_path, 'r') as fp2, \
            open(sub_unttext_path, 'r') as fp3:
        sub_stamps[show][episode_name] = json.load(fp1)
        sub_text[show][episode_name] = json.load(fp2)
        raw_sub_text[show][episode_name] = json.load(fp3)
except IOError:  # is this required?
    # Cache miss: regenerate everything from the raw subtitle file.
    with io.open(DIR_SUBS[show]+'/'+episode_name+'.srt', 'r', encoding='utf-8') as sub:
        # replace unicode chars with closest equivalents
        ascii_srt = unicodedata.normalize('NFKD', sub.read()).encode('ascii', 'ignore')

    # temporary file containing semi preprocessed subtitle
    # don't use if doing preprocessing?
    # `ascii_srt` is a byte string, so the file is opened in binary mode; a
    # text-mode file rejects bytes with TypeError on Python 3.
    with open(temp_path, 'wb') as temp_fp:
        temp_fp.write(ascii_srt)

    # The temp file is closed (and therefore flushed) before it is parsed.
    result = preprocessor.fetch_subtitle_data(temp_path)
    sub_stamps[show][episode_name] = result['sub_stamps']
    sub_text[show][episode_name] = result['sub_text']
    raw_sub_text[show][episode_name] = result['raw_sub_text']

    # Persist the freshly computed data so the next run takes the cached path.
    with open(sub_stamps_path, 'w') as fp1, \
            open(sub_text_path, 'w') as fp2, \
            open(sub_unttext_path, 'w') as fp3:
        json.dump(sub_stamps[show][episode_name], fp1)
        json.dump(sub_text[show][episode_name], fp2)
        json.dump(raw_sub_text[show][episode_name], fp3)

# for plot to subtitle mapping in a variable
# One empty dict per show in each of the three top-level containers.
plot_to_sub, idf, tf_idf = {}, {}, {}
for show in list_of_shows:
    plot_to_sub[show] = {}
    idf[show] = {}
    tf_idf[show] = {}
# Load the processed subtitle data for one subtitle file from its JSON cache
# files, rebuilding the caches from the raw .srt file when they are unusable,
# then append the result to the three parallel lists.
#
# NOTE(review): `sub_stamps_path`, `sub_text_path`, `sub_unttext_path` and
# `sub_file` are bound before the visible code, presumably inside an enclosing
# per-file loop -- re-indent the statements up to the three appends to match
# that loop when applying.
temp_path = DIR_SUBS + "/" + "temp.srt"
try:
    with open(sub_stamps_path, "r") as fp1, \
            open(sub_text_path, "r") as fp2, \
            open(sub_unttext_path, "r") as fp3:
        # Load all three before touching the lists so a failure part-way
        # through cannot leave the lists with different lengths.
        t1, t2, t3 = json.load(fp1), json.load(fp2), json.load(fp3)
except (IOError, ValueError):
    # IOError: a cache file is missing. ValueError: a cache file exists but
    # does not hold valid JSON (e.g. an empty file left by an interrupted
    # earlier run). Either way, regenerate from the raw subtitle file.
    with io.open(DIR_SUBS + "/" + sub_file + ".srt", "r", encoding="utf-8") as sub:
        # replace unicode chars with closest equivalents
        ascii_srt = unicodedata.normalize("NFKD", sub.read()).encode("ascii", "ignore")

    # temporary file containing semi preprocessed subtitle
    # `ascii_srt` is a byte string, so the file is opened in binary mode; a
    # text-mode file rejects bytes with TypeError on Python 3.
    with open(temp_path, "wb") as temp_fp:
        temp_fp.write(ascii_srt)

    # The temp file is closed (and therefore flushed) before it is parsed.
    t1, t2, t3 = fetch_subtitle_data(temp_path)

    # The cache files are opened for writing only once the data exists, so a
    # failure above never leaves truncated, unparsable caches behind.
    with open(sub_stamps_path, "w") as fp1, \
            open(sub_text_path, "w") as fp2, \
            open(sub_unttext_path, "w") as fp3:
        json.dump(t1, fp1)
        json.dump(t2, fp2)
        json.dump(t3, fp3)

sub_stamps.append(t1)
sub_text.append(t2)
untouched_sub_text.append(t3)

# for plot to subtitle and subtitle to shot
# will work for the first time
# One placeholder slot per episode in each list.
plot_to_sub = [None] * no_episodes
idf = [None] * no_episodes
tf_idf = [None] * no_episodes
# NOTE(review): the remainder of this loop body lies outside the visible code.
for index, vid_file in enumerate(file_names):
    plot_to_sub_path = DIR_PLTSUB + "/" + vid_file + "_proc_pltsub.json"
    idf_path = DIR_PLTSUB + "/" + vid_file + "_idf.json"