def make_indexing(sminames, output_name, movie_dir, verbose=False):
    """Build a word-level index from one or more .smi subtitle files.

    Each English subtitle sentence is tokenized and one row is emitted per
    (token, subtitle) pair. The resulting DataFrame is also saved as a CSV
    in ``movie_dir``.

    Args:
        sminames: a single .smi filename or a list of them, relative to
            ``movie_dir``.
        output_name: base name used for the saved CSV.
        movie_dir: directory containing the .smi files (and, per the rows
            emitted, the matching videos).
        verbose: unused; kept for backward compatibility with callers.

    Returns:
        pandas.DataFrame of the indexing rows (index named 'index').

    Raises:
        NameError: when a file has no detectable English subtitle signal.
    """
    # Accept a bare filename for convenience (was: type(x) == type('...')).
    if isinstance(sminames, str):
        sminames = [sminames]

    indexing_result = []
    for sminame in sminames:
        smi = smipy.Smi(os.path.join(movie_dir, sminame))
        subtitles = smi.subtitles
        print('Processing {}'.format(sminame))

        # 'T'/'F' flag recording whether subtitle sync was verified.
        sinc = 'T' if smi.sinc_verification else 'F'

        if smi.eng_signal is None:
            raise NameError('The english subtitle signal is not detected in {}'.format(sminame))
        if smi.kor_signal is None:
            warnings.warn('The korean subtitle signal is not detected')

        # Loop-invariant: filename without extension, dots removed.
        video_code = ''.join(sminame.split('.')[:-1])

        for ind, sub in enumerate(subtitles):
            eng_sentence = sub['eng']
            if eng_sentence is None:
                continue  # nothing to tokenize for this cue
            token_list = to_tokens(eng_sentence)
            tokens_joined = ' '.join(token_list)  # hoisted out of token loop
            for word in token_list:
                indexing_result.append(dict(
                    der_word=word, subtitle_ind=ind, eng_sent=eng_sentence,
                    video_code=video_code,
                    start=sub['start'], end=sub['end'],
                    tokens=tokens_joined,
                    movie_dir=movie_dir,
                    sinc=sinc,
                ))

    df = pd.DataFrame(indexing_result)
    df.index.name = 'index'

    #### Saving results
    df_path = get_save_path(dir=movie_dir, name=output_name, format='.csv')
    df.to_csv(df_path, encoding=CSV_ENCODING)
    return df
def make_subtitle(movie_name, directory, overlap=1):
    """Assemble an .smi subtitle file from per-clip transcript files.

    Looks up files named '<movie_name>_*' in ``directory``: '.mp4' clips
    supply the timing (via their durations) and '.txt' files supply the
    text (JSON with a 'text' field). Consecutive clips overlap by
    ``overlap`` seconds, which is trimmed out of the computed windows.

    Args:
        movie_name: prefix shared by all clip/transcript files.
        directory: directory containing the clips and transcripts.
        overlap: seconds of overlap between consecutive clips.

    Side effects:
        Writes '<movie_name>.smi' into ``directory``.
    """
    format_m = '.mp4'
    print('making subtitle {}'.format(movie_name))
    all_names = [
        n for n in os.listdir(directory)
        if n[:len(movie_name) + 1] == movie_name + '_'
    ]
    clip_names = sorted(n for n in all_names if n[-4:] == format_m)
    # BUG FIX: transcripts must be sorted too -- os.listdir order is
    # arbitrary, and clips/transcripts are paired positionally below.
    text_names = sorted(n for n in all_names if n[-4:] == '.txt')

    durations = []
    for cn in clip_names:
        clip = VideoFileClip(os.path.join(directory, cn))
        try:
            durations.append(clip.duration)
        finally:
            clip.close()  # release the ffmpeg reader / file handle

    times = []
    for ind, dur in enumerate(durations):
        # Subtract the cumulative overlap so the windows are contiguous.
        start_time = round(sum(durations[:ind]) - ind * overlap, 2) + overlap
        end_time = start_time + dur - overlap
        # Milliseconds; end trimmed by 2 ms so cues do not touch.
        times.append((int(start_time * 1000), int(end_time * 1000) - 2))

    subs = []
    for ind, tn in enumerate(text_names):
        # was: open(...).read() with the handle leaked
        with open(os.path.join(directory, tn), 'r') as f:
            txt = json.loads(f.read())['text']
        s, e = times[ind]
        subs.append(dict(start=s, end=e, eng=txt, kor=''))

    smi = smipy.Smi()
    smi.from_sentences(subs)
    smitext = smi.export()
    smi_path = os.path.join(directory, movie_name.split('.')[0] + '.smi')
    print('saving subtitle {}'.format(smi_path))
    with open(smi_path, 'w') as f:
        f.write(smitext)
import os import smipy import naverapi from os.path import join as osj import numpy as np threshold_ms = 20000 directory = './friends' smi_names = [n for n in os.listdir(directory) if n[-4:] == '.smi'] bundle = 10 for n in smi_names: print(n) smi = smipy.Smi(osj(directory, n)) smi.kor_signal = 'KRCC' sub_len = len(smi.subtitles) bundle_len = int(np.ceil(sub_len / bundle)) for i_bundle in range(bundle_len): start = i_bundle * bundle end = start + bundle st = smi.subtitles[start:end] st = [s['eng'] for s in st] engtext = '\n'.join(st) tr = naverapi.papago_translate(engtext) if tr is not None: tr_list = tr.split('\\n') print(len(st), len(tr_list)) for ind, kor in enumerate(tr_list):
# Pair up English/Korean subtitle files in `directory` and dump each
# track to CSV via smipy. English files lack the '_k' marker in their
# name; Korean files carry it. (Earlier re-encoding helpers that
# rewrote the files as cp949 were removed as dead code.)
engsmis = [fname for fname in os.listdir(directory)
           if fname[-4:] == '.smi' and '_k' not in fname]
kornames = [fname for fname in os.listdir(directory)
            if fname[-4:] == '.smi' and '_k' in fname]

print(engsmis)

for engname in engsmis:
    # Derive the Korean counterpart's filename: '<base>_k.smi'.
    base = engname.split('.')[0]
    korname = base + '_k' + '.smi'
    eng_track = smipy.Smi(os.path.join(directory, engname))
    kor_track = smipy.Smi(os.path.join(directory, korname))
    eng_track.to_csv(engname, dest=directory)
    kor_track.to_csv(korname, dest=directory)
def make_word_list(originals, original_to_derivative, original_to_meaning,
                   smi_df_lists, output_name,
                   before_window=20000, after_window=10000, output_dir='./'):
    """Draft a vocabulary-book table from word lists and subtitle indexes.

    For every derivative word form, collects each occurrence found in the
    indexing DataFrames (as produced by ``make_indexing``), attaches the
    surrounding subtitle context from the source .smi file, and writes two
    CSVs to ``output_dir``: '<output_name>.csv' (one row per occurrence)
    and '<output_name>_abs.csv' (occurrence count per original word).

    Parameters
    ----------
    originals : iterable of base ("original") words.
    original_to_derivative : dict mapping each original word to its list
        of derivative word forms.
    original_to_meaning : dict mapping each original word to its meaning.
    smi_df_lists : a DataFrame, a CSV filename (resolved under
        ``output_dir``), or a list of either.
    output_name : base name for the two output CSVs.
    before_window, after_window : milliseconds of subtitle context to
        collect before / after each matched sentence.
    output_dir : directory for both input CSV lookup and output.
    """
    Author = 'YB'  # author tag kept from the original source

    # --- index the word bag -------------------------------------------
    # original word -> its position in `originals`
    ori_to_index = {}
    for index, original in enumerate(originals):
        ori_to_index[original] = index

    # derivative word -> index of its original word
    der_to_index = {}
    for index, original in enumerate(originals):
        ders = original_to_derivative[original]
        for der in ders:
            der_to_index[der] = index

    # flat list of all derivatives, plus reverse map derivative -> original
    derivative_to_ori = {}
    derivatives = []
    for k, v in original_to_derivative.items():
        for der in v:
            derivatives.append(der)
            derivative_to_ori[der] = k

    # per-original occurrence counters: all sources / Big Bang Theory only
    ori_counter = {}
    for ori in originals:
        ori_counter[ori] = 0
    ori_counter_bigbang = {}
    for ori in originals:
        ori_counter_bigbang[ori] = 0

    # wrap a single DataFrame/path in a list (translated comment)
    if not type(smi_df_lists) == type([]):
        smi_df_lists = [smi_df_lists]

    # Expected df structure (rows created by make_indexing):
    #   der_word, subtitle_ind, eng_sent, video_code,
    #   start, end, tokens, movie_dir, sinc
    res_list = []
    for df in smi_df_lists:
        # path mode (translated comment): a string is treated as a CSV
        # file name under output_dir
        if type(df) == type('path.csv'):
            if df[-4:] == '.csv':
                df = df[:-4]
            df = pd.read_csv(os.path.join(output_dir, df + '.csv'),
                             encoding=CSV_ENCODING)
        for derivative in derivatives:
            # all occurrences of this derivative word in the index
            df_sel = df[df['der_word'] == derivative]
            for _i in range(len(df_sel)):
                row = df_sel.iloc[_i]
                d = dict(row)
                ind = der_to_index[derivative]
                ori = derivative_to_ori[derivative]
                d['word_ind'] = der_to_index[derivative]
                d['ori_word'] = derivative_to_ori[derivative]
                # clip numbering starts at 10 -- presumably to keep a fixed
                # digit width in make_clip's clip_code; TODO confirm
                d['clip_index'] = ori_counter[ori] + 10
                ori_counter[ori] += 1
                # count Big Bang Theory hits separately (video codes 'BB*')
                if d['video_code'][:2] == 'BB':
                    ori_counter_bigbang[ori] += 1

                # pick neighbors of sentences: re-open the source .smi and
                # slice a time window around this subtitle
                smipath = os.path.join(d['movie_dir'], d['video_code'] + '.smi')
                smi = smipy.Smi(smipath)
                neighbor_start = d['start'] - before_window
                neighbor_end = d['end'] + after_window
                cut_list, cut_ind = smi.slice(start_time=neighbor_start,
                                              end_time=neighbor_end)
                try:
                    # split the window into the sentences before/after the match
                    _senti = cut_ind.index(d['subtitle_ind'])
                    before_list = cut_list[:_senti]
                    after_list = cut_list[_senti + 1:]
                except:
                    # matched subtitle not inside the slice: no context
                    before_list = []
                    after_list = []
                before_text = '\n'.join([sent['eng'] for sent in before_list])
                after_text = '\n'.join([sent['eng'] for sent in after_list])
                _before_kr = '\n'.join([sent['kor'] for sent in before_list])
                _after_kr = '\n'.join([sent['kor'] for sent in after_list])
                d['_before_no'] = len(before_list)
                d['_after_no'] = len(after_list)
                d['before_text'] = before_text
                d['after_text'] = after_text
                d['_before_kor'] = _before_kr
                d['_after_kor'] = _after_kr
                d['word_meaning'] = original_to_meaning[ori]
                d['verify'] = 'F'  # manual-verification flag, default false
                # cap Big Bang Theory rows at 10 per original word
                if ori_counter_bigbang[ori] > 10:
                    pass
                else:
                    res_list.append(d)
    df = pd.DataFrame.from_dict(res_list)

    # abstract result: occurrence count per original word
    abstract_result = []
    for ind, ori in enumerate(originals):
        no_occur = ori_counter[ori]
        abstract_result.append(dict(ori_word=ori, occurance=no_occur))
    df_abs = pd.DataFrame.from_dict(abstract_result)

    # Saving df
    out_path = get_save_path(dir=output_dir, name=output_name, format='.csv')
    out_path_abs = get_save_path(dir=output_dir, name=output_name + '_abs',
                                 format='.csv')
    df.to_csv(out_path, encoding=CSV_ENCODING)
    df_abs.to_csv(out_path_abs, encoding=CSV_ENCODING)
def make_clip(words_path, title, out_dir='./clips', pad=2000, encoding='utf-8'):
    """Cut per-word video clips and subtitle snippets from a word-list CSV.

    For each row of the CSV at ``words_path`` (as produced by
    ``make_word_list``): builds a 7-digit clip code, writes a sliced .smi
    and an .mp4 subclip into ``out_dir``, and finally writes '<title>.csv'
    with one row per successfully cut clip.

    Parameters
    ----------
    words_path : path of the word-list CSV.
    title : base name of the output CSV.
    out_dir : destination directory for clips, subtitles, and the CSV.
    pad : milliseconds of padding added before/after each clip.
    encoding : encoding tried first when reading ``words_path``
        (falls back to CSV_ENCODING on failure).
    """
    # NOTE(review, translated from Korean): "re-order / re-index the
    # variables below!!"
    try:
        worddf = pd.read_csv(words_path, encoding=encoding)
    except:
        # fall back to the project-wide CSV encoding
        worddf = pd.read_csv(words_path, encoding=CSV_ENCODING)
    clip_result_list = []
    for i in range(len(worddf)):
        row = dict(worddf.iloc[i])
        ori = row['ori_word']
        der = row['der_word']
        clip_index = row['clip_index']
        # padded sentence window in ms (currently unused below -- the clip
        # window is recomputed from sub_slice instead)
        start_time = row['start'] - pad
        end_time = row['end'].item() + pad
        word_meaning = row['word_meaning']
        movie_dir = row['movie_dir']
        video_code = row['video_code']
        # human-readable show name from the 2-letter video-code prefix
        if video_code[:2] == 'BB':
            video_name = 'Big Bang Theory'
        elif video_code[:2] == 'SI':
            video_name = 'Silicon Valley'
        elif video_code[:2] == 'FR':
            video_name = 'Friends'
        else:
            video_name = ''
        # assumes code layout: 3rd char = season, rest = episode -- TODO confirm
        video_name = video_name + ' ' + 'Season {} Ep{}'.format(video_code[2],
                                                                video_code[3:])
        eng_sent = row['eng_sent']
        word_ind = row['word_ind']
        # position of the derivative word within the sentence (-1 if not
        # found; keeps the LAST match when the word occurs more than once)
        word_loc = -1
        for _i, _w in enumerate(eng_sent.split(' ')):
            if der in ''.join(to_tokens(_w)):
                word_loc = _i
        # 7-digit clip code: word index in the thousands place, clip index below
        clip_code = '{}'.format(11000000 + clip_index + int(row['word_ind']) * 1000)
        clip_code = clip_code[-7:]
        smipath = os.path.join(row['movie_dir'], row['video_code'] + '.smi')
        smi = smipy.Smi(smipath)
        # recover the number of context lines around the sentence from the
        # before/after text blobs (non-str, e.g. NaN, means no context)
        if type(row['before_text']) == type('sometext'):
            before_no = len(row['before_text'].strip().split('\n'))
        else:
            before_no = 0
        if type(row['after_text']) == type('sometext'):
            after_no = len(row['after_text'].strip().split('\n'))
        else:
            after_no = 0
        sub_index = row['subtitle_ind']
        kor_sent = smi.subtitles[sub_index]['kor']
        # subtitle window: context lines plus the matched sentence itself
        sub_slice = smi.subtitles[sub_index - before_no:sub_index + 1 + after_no]
        clip_start = sub_slice[0]['start'] - pad
        clip_end = sub_slice[-1]['end'] + pad
        # sentence times relative to the clip start
        sent_start = row['start'] - clip_start
        sent_end = row['end'] - clip_start
        # for debugging, lines of neighbors
        before_list = sub_slice[:before_no]
        # NOTE(review): when after_no == 0 this slice is the WHOLE window,
        # not an empty list -- harmless here since after_list is unused
        after_list = sub_slice[-after_no:]
        whole_eng = '\n'.join([sent['eng'] for sent in sub_slice])
        whole_kor = '\n'.join([sent['kor'] for sent in sub_slice])
        # export smi covering the clip's time range
        cliptxt = smi.export(clip_start, clip_end, slice_manual=sub_slice)
        with open(os.path.join(out_dir, clip_code + '.smi'), 'w') as f:
            f.write(cliptxt)
        # export sliced video (times converted from ms to seconds)
        try:
            ffmpeg_extract_subclip(filename=os.path.join(movie_dir, video_code + '.mkv'),
                                   t1=clip_start / 1000, t2=clip_end / 1000,
                                   targetname=os.path.join(out_dir, clip_code + '.mp4'))
        except:
            # skip rows whose video cannot be cut
            print('error in {}'.format(video_code))
            continue
        # export final db row for the app
        d = dict(word_ind=word_ind, ori_word=ori, clip_code=clip_code,
                 eng_sent=eng_sent, kor_sent=kor_sent,
                 sent_start=sent_start, sent_end=sent_end,
                 word_loc=word_loc, line_loc=before_no,
                 whole_eng=whole_eng, whole_kor=whole_kor,
                 word_meaning=word_meaning, video_name=video_name,
                 _v_s=clip_start, _v_e=clip_end)
        clip_result_list.append(d)
    df = pd.DataFrame.from_dict(clip_result_list)
    df.to_csv(get_save_path(dir=out_dir, name=title, format='.csv'),
              encoding=CSV_ENCODING)
import re
import os
import codecs

import smipy

# Smoke-test script: parse a sample subtitle file with smipy's debug
# output enabled, then dump the parsed track to CSV.
sample_smi = smipy.Smi('./testbb501.smi', debug=True)
sample_smi.to_csv(title='testbb501')