def __init__(self, workdir, sr=16000, max_length=1200,
             youtube_path=None, exports_path=None):
    self.workdir = workdir
    self.sr = sr
    self.max_length = max_length
    # self.abs_dist_path = t.create_folder(os.path.join(workdir, 'distances'))
    # optional: only load the source lists when paths are provided
    self.youtube_list = t.load_json(youtube_path) if youtube_path else None
    self.exports_list = t.load_json(exports_path) if exports_path else None
def run(self, src, distances_path):
    folder_path = os.path.dirname(distances_path)
    try:
        t.create_folder(folder_path)
        distances = self.compute_dist(src)
        if distances is None:
            return None
        t.save_array(distances, distances_path)
        return distances_path
    except Exception as e:
        print(folder_path, e)
        return None
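# compute_dist() is not defined in this excerpt. A minimal sketch of what it
# might do, assuming the audio path: load the track with librosa at self.sr
# and return an MFCC matrix. Everything below (librosa usage, n_mfcc=20) is
# an assumption, not the original implementation.
def compute_dist(self, src):
    import librosa  # assumed dependency
    try:
        audio, _ = librosa.load(src, sr=self.sr, mono=True,
                                duration=self.max_length)
    except Exception as e:
        print(src, e)
        return None
    # shape: (n_mfcc, n_frames)
    return librosa.feature.mfcc(y=audio, sr=self.sr, n_mfcc=20)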
def process(self, youtube_path, exports_path, workdir):
    yt_list = t.load_json(youtube_path)
    ex_list = t.load_json(exports_path)
    self.test_len = 0
    self.run_from = 41609
    self.timestamp = time.time()
    self.list_to_process = yt_list + ex_list
    self.err_file = os.path.join(workdir, 'errors.json')
    run_to = self.run_from + self.test_len if self.test_len != 0 else len(self.list_to_process)
    errors = []
    for idx in tqdm(range(run_to), ascii=True, desc='process {} links'.format(run_to)):
        if idx < self.run_from:
            continue
        record = self.list_to_process[idx]
        path, out_path_rel = record['src'], record['dst']
        out_path = t.create_folder(os.path.join(workdir, out_path_rel))
        out = os.path.join(out_path, 'image.png')
        out_info = os.path.join(out_path, 'info.json')
        if os.path.isfile(out):
            continue
        try:
            frame, info = self.get_full_frame(path)
            info['src'] = path
            # np.save(out, frame)
            cv2.imwrite(out, frame)
            with open(out_info, 'w') as outfile:
                json.dump(info, outfile, indent=4)
        except Exception as e:
            err = '{}: {}'.format(path, e)
            print(err)
            errors.append(err)
    if len(errors):
        with open(self.err_file, 'w') as outfile:
            json.dump(errors, outfile, indent=4)
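# get_full_frame() is not included in this excerpt. A hypothetical sketch,
# assuming it samples a handful of frames across the clip and tiles them into
# a single image: the 'rets' list of per-frame read flags is what the MSE step
# checks later; the frame count, tile layout and 640x360 size are assumptions.
def get_full_frame(self, path, n_frames=9):
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    rets, frames = [], []
    for pos in np.linspace(0, max(total - 1, 0), n_frames, dtype=int):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(pos))
        ret, frame = cap.read()
        rets.append(bool(ret))
        frames.append(frame if ret else np.zeros((360, 640, 3), np.uint8))
    cap.release()
    frames = [cv2.resize(f, (640, 360)) for f in frames]
    # 3x3 grid -> one 1080x1920 tile image
    tile = np.vstack([np.hstack(frames[i:i + 3]) for i in range(0, n_frames, 3)])
    return tile, {'rets': rets}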
def run(self, src, vector_path):
    folder_path = os.path.dirname(vector_path)
    t.create_folder(folder_path)
    # tmp_video_path = os.path.join(self.workdir, dst, os.path.basename(src))
    # shutil.copy(src, tmp_video_path)
    video_vector = self.compute_vector(src)
    t.save_array(video_vector, vector_path)
    return vector_path
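# compute_vector() is not shown. One plausible sketch (an assumption, not the
# original): a per-frame mean-brightness signature, which is cheap to compute
# and sufficient for coarse temporal alignment between two cuts of a video.
def compute_vector(self, src):
    cap = cv2.VideoCapture(src)
    vector = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        vector.append(float(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).mean()))
    cap.release()
    return np.array(vector, dtype=np.float32)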
def read_durations(self, data):
    for idx in tqdm(range(len(data)), ascii=True, desc='ffprobe on {} files'.format(len(data))):
        record = data[idx]
        try:
            duration = t.get_duration(record[n.SRC])
            data[idx][n.DURATION] = duration * 1000  # seconds -> milliseconds
        except Exception as e:
            print(record, e)
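# t.get_duration() is external; the tqdm description above suggests it shells
# out to ffprobe. A minimal sketch under that assumption (the helper name and
# its return unit, seconds, are inferred from the * 1000 conversion above):
import subprocess

def get_duration(path):
    out = subprocess.check_output([
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        path])
    return float(out)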
def run(self, matches_path, out_matches_path):
    self.matches = t.load_pickle(matches_path)
    self.out_matches_path = out_matches_path
    self.file2mfcc = FileToMFCC(workdir)
    # self.file2video_vector = FileToVideoVector(workdir)
    process_list = []
    for ida, match in self.matches.items():
        for idb, match_item in match.idb_items.items():
            process_list.append((ida, idb))
    random.shuffle(process_list)
    for idx in tqdm(range(len(process_list)), ascii=True,
                    desc='process {} lines'.format(len(process_list))):
        idy, ide = process_list[idx]
        y_info = t.load_json(os.path.join(workdir, idy, 'info.json'))
        e_info = t.load_json(os.path.join(workdir, ide, 'info.json'))
        if None in [y_info, e_info]:
            print('ERR', idy, ide)
            continue
        y_a_path, y_v_path = self.get_vectors(idy, y_info)
        e_a_path, e_v_path = self.get_vectors(ide, e_info)
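# get_vectors() is not part of this excerpt. Judging by the 'distances.npy' /
# 'video_vector.npy' constants in the driver script below, a sketch might look
# like this; the lazy behavior (compute only when missing) is an assumption:
def get_vectors(self, id, info):
    a_path = os.path.join(workdir, id, 'distances.npy')
    v_path = os.path.join(workdir, id, 'video_vector.npy')
    if not os.path.isfile(a_path):
        self.file2mfcc.run(info['src'], a_path)
    # the video-vector branch is disabled above, so v_path may not exist yet
    return a_path, v_path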
def run(self):
    self.yt_list = t.load_json(self.youtube_path)
    self.ex_list = t.load_json(self.exports_path)
    self.youtube_to_process, self.export_to_process = [], []
    print('==> prepare_in_out_list')
    self.prepare_in_out()
    self.youtube_to_process_path = os.path.join(self.workdir, 'youtube_to_process.json')
    self.export_to_process_path = os.path.join(self.workdir, 'export_to_process.json')
    # print('==> read youtube durations')
    # self.read_durations(self.youtube_to_process)
    # t.save_json(self.youtube_to_process_path, self.youtube_to_process)
    print('==> read export durations')
    self.read_durations(self.export_to_process)
    t.save_json(self.export_to_process_path, self.export_to_process)
    return self.youtube_to_process_path, self.export_to_process_path
def run(self):
    done = 0
    counter = 0
    for ida, match in tqdm(self.matches.items(), ascii=True,
                           desc='process {} lines'.format(len(self.matches))):
        counter += 1
        try:
            if len(self.matches[ida].mse):
                continue
        except AttributeError:
            self.matches[ida].mse = []
        a_path = os.path.join(self.workdir, match.ida, 'image.png')
        a_info_path = os.path.join(self.workdir, match.ida, 'info.json')
        if not os.path.isfile(a_path) or not os.path.isfile(a_info_path):
            print('cannot read', a_path)
            continue
        A = cv2.imread(a_path)
        A_info = t.load_json(a_info_path)
        # skip tiles where more than two frame reads failed
        if len([ret for ret in A_info['rets'] if not ret]) > 2:
            continue
        if len(match.diff):
            for idb in match.idb:
                b_path = os.path.join(self.workdir, idb, 'image.png')
                b_info_path = os.path.join(self.workdir, idb, 'info.json')
                if not os.path.isfile(b_path) or not os.path.isfile(b_info_path):
                    print('cannot read', b_path)
                    continue
                B = cv2.imread(b_path)
                B_info = t.load_json(b_info_path)
                if len([ret for ret in B_info['rets'] if not ret]) > 2:
                    continue
                # cast to a signed dtype first: uint8 subtraction wraps around
                mse = ((A.astype(np.int32) - B.astype(np.int32)) ** 2).mean(axis=None)
                match.mse.append((idb, mse))
            done += 1
        if counter % 100 == 0:
            t.save_pickle(self.path_out, self.matches)
            print('data_saved')
    t.save_pickle(self.path_out, self.matches)
    print('data_saved')
    print('done {}/{}'.format(done, len(self.matches)))
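# Note on the cast above: uint8 arithmetic wraps around instead of going
# negative, so computing ((A - B) ** 2) directly on cv2.imread() output
# corrupts the MSE. For example, with pixel values a = 10 and b = 200 stored
# as uint8, a - b wraps to (10 - 200) % 256 = 66, while the true squared error
# is (10 - 200) ** 2 = 36100. Casting to a signed dtype first gives the
# correct value; the viewer script further down makes the same cast to
# np.int16 before taking an absolute difference.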
def get_info(self, id):
    info_path = os.path.join(self.workdir, id, 'info.json')
    info = t.load_json(info_path)
    if info is None:
        print('cannot read', info_path)
    return info
#########
from api.tools import Tools as t
from tqdm import tqdm
import os

y = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json'
e = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/export_to_process.json'
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'

all_items = t.load_json(e) + t.load_json(y)
video_exts = ['.MP4', '.WEBM', '.MKV', '.MOV']
cnt = 0
sizes = 0
for idx in tqdm(range(len(all_items)), ascii=True, desc='process {} lines'.format(len(all_items))):
    item = all_items[idx]
    dst = item['dst']
    folderpath = os.path.join(workdir, dst)
    listdir = os.listdir(folderpath)
    # for name in listdir:
    #     sizes += os.path.getsize(os.path.join(folderpath, name))
    if 'info.json' not in listdir:
        cnt += 1
print('missing info.json in {}/{} folders'.format(cnt, len(all_items)))
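#########
# The api.tools.Tools helper imported as 't' throughout these scripts is not
# included in the excerpt. A minimal sketch of the methods the call sites rely
# on; the bodies below are assumptions, only the names and call shapes come
# from the scripts themselves:
import json
import os
import pickle

import numpy as np


class Tools:
    @staticmethod
    def create_folder(path):
        os.makedirs(path, exist_ok=True)
        return path

    @staticmethod
    def load_json(path):
        # callers test the result against None, so missing files return None
        if not os.path.isfile(path):
            return None
        with open(path) as f:
            return json.load(f)

    @staticmethod
    def save_json(path, data):
        with open(path, 'w') as f:
            json.dump(data, f, indent=4)

    @staticmethod
    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def save_pickle(path, data):
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    @staticmethod
    def save_array(arr, path):
        np.save(path, arr)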
from api.tools import Tools as t
import random

#########
# path = '100_id_list.txt'
yt_to_process = t.load_json('/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')
yt_to_process_100 = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process_100.json'
# file1 = open(path, 'r')
# lines = file1.read().splitlines()
# id_list = []
unknown = 0
# dct_yt_list = {}
# for item in yt_to_process:
#     dct_yt_list[item['dst']] = item
random.shuffle(yt_to_process)
# y100 = []
# for line in lines:
#     key = 'youtube/' + line.split(',')[-1]
#     if key not in dct_yt_list:
#         unknown += 1
#     else:
#         dct_yt_list[key]['year'] = line.split(',')[-2]
#         dct_yt_list[key]['id'] = line.split(',')[0]
#         y100.append(dct_yt_list[key])
y100 = yt_to_process[:100]
print('unknown', unknown)
print('new', len(y100))
t.save_json(yt_to_process_100, y100)
C = np.array(
    np.abs(np.array(A, dtype=np.int16) - np.array(B, dtype=np.int16)),
    dtype=np.uint8)
res = (640, 360)
A = cv2.resize(A, res)
B = cv2.resize(B, res)
C = cv2.resize(C, res)
cv2.imshow('youtube', A)
cv2.imshow('exports', B)
cv2.imshow('diff', C)
cv2.waitKey(0)
time.sleep(0.2)

matches = t.load_pickle(path)
ok = 0
nomse = 0
multiple_mse = 0
deleted = 0
to_delete = []
# youtube_to_process_with_year = t.load_json('/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process_100.json')
dct_yt_list = {}
# for item in youtube_to_process_with_year:
#     dct_yt_list[item['dst']] = item
years = {}
for idx, (ida, match) in enumerate(matches.items()):
    ida_info_path = os.path.join(workdir, ida, 'info.json')
    if not os.path.isfile(ida_info_path):
from file_to_mfcc import FileToMFCC
from file_to_video_vector import FileToVideoVector
from api.offset_calc import OffsetCalc
from tqdm import tqdm
# from match_by_duration import MatchByDuration
import random
import shutil

MIN_CORR_SCORE = 0
a_filename = 'distances.npy'
v_filename = 'video_vector.npy'
OVERWRITE_A = False
OVERWRITE_V = False

y = t.load_json('/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')
e = t.load_json('/mnt/data/palpatine/DATASETS/YT_LINK/workdir/export_to_process.json')
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'
part_num = 5
print('part_num', part_num)
whole_list = y + e
# random.shuffle(whole_list)
step = len(whole_list) // 6
parts = []
for idx in range(6):
    from_to = [idx * step, (idx + 1) * step - 1]
    if idx == 5:
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'
# exports_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/sample/sources_a.json'
# yt_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/sample/sources_b.json'
# workdir = '/mnt/data/palpatine/SAMPLES/YT_LINK/sample/workdir'
# exports_list = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_exports_test.json'
# yt_list = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_youtube_test.json'
# exports_list = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_exports.json'
# yt_list = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_youtube.json'
# workdir = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/workdir'

test_len = 0
ss = 0
DIFF_LIMIT = 1000
workdir = t.create_folder(workdir)
t_start = time.time()

print('\nCREATE PROCESS POOL AND IMAGES')
vl = CreateProcessPool(youtube_path=yt_list_path, exports_path=exports_list_path,
                       workdir=workdir, test_len=test_len, ss=ss)
youtube_to_process_path, export_to_process_path = vl.run()
# CreateImages().process(youtube_to_process_path, export_to_process_path, workdir)

print('\nMATCH BY DURATION')
m = MatchByDuration(youtube_path=youtube_to_process_path,
                    export_path=export_to_process_path,
                    workdir=workdir, diff_limit=DIFF_LIMIT)
m.get_duration_lists()
print('done in {:.02f} s'.format(time.time() - t_start))
def __init__(self, workdir, matches_path, path_out):
    self.matches = t.load_pickle(matches_path)
    self.workdir = workdir
    self.path_out = path_out
from api.tools import Tools as t

y_popular_id_list = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/100_popular_from_2017.json')
y_list = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')

all_youtube_dict_by_id = {}
for item in y_list:
    y_id = item['src'].split('/')[-1].split('.')[0]
    all_youtube_dict_by_id[y_id] = item

new_list_to_process = []
ok = 0
for y_id_pop in y_popular_id_list:
    if y_id_pop in all_youtube_dict_by_id:
        ok += 1
        new_list_to_process.append(all_youtube_dict_by_id[y_id_pop])
print('{}/{}'.format(ok, len(y_popular_id_list)))
t.save_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/100_popular_from_2017_to_process.json',
    new_list_to_process)
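# The id extraction above assumes paths like '.../<youtube_id>.<ext>'.
# An equivalent spelling via os.path (a sketch, not a behavior change):
import os

def extract_id(src):
    return os.path.splitext(os.path.basename(src))[0]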
from api.tools import Tools as t
import csv

path = '/mnt/data/palpatine/SAMPLES/YT_LINK/Vimeo migration - DONE - data-3_2019-Veronika.csv'
yt_path = '/mnt/data/palpatine/DATASETS/YT_LINK/REF/sources_youtube.json'
ex_path = '/mnt/data/palpatine/DATASETS/YT_LINK/REF/sources_exports.json'
youtube_entire_video_list = '/mnt/data/palpatine/SAMPLES/YT_LINK/youtube_listdir_video.json'
youtube_entire_video_list = t.load_json(youtube_entire_video_list)

file = open(path, 'r')
reader = csv.reader(file)
youtube_list = []
exports_list = []
for line in reader:
    export, yt_link = line[3], line[2]
    if len(export.split('/')) == 1:
        continue
    if not len(export) or not len(yt_link):
        continue
    if 'vimeo' in yt_link.lower():
        continue
    yt_id = yt_link.rsplit('/')[-1]
    yt_video_path = None
    for video in youtube_entire_video_list:
        if yt_id in video:
            yt_video_path = video
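# The inner scan above costs O(len(youtube_entire_video_list)) per CSV row.
# If the listing is large, a one-off index avoids that; this sketch assumes
# the filename stem equals the YouTube id (the loop above only checks for a
# substring, so this is a slightly stricter assumption):
import os

video_by_stem = {}
for video in youtube_entire_video_list:
    video_by_stem[os.path.splitext(os.path.basename(video))[0]] = video
# then, inside the loop: yt_video_path = video_by_stem.get(yt_id)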