Example #1
    def __init__(self, workdir, youtube_path, exports_path, sr=16000,
                 max_length=1200):
        self.workdir = workdir
        self.sr = sr
        self.max_length = max_length
        self.youtube_list = t.load_json(youtube_path)
        self.exports_list = t.load_json(exports_path)
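Note: `t` throughout these snippets is the project's `Tools` facade (`from api.tools import Tools as t`). Its implementation is not included; the following is only a minimal sketch inferred from how it is called below, so every name and signature should be read as an assumption.

# Hypothetical reconstruction of the Tools helpers these snippets call.
import json
import os
import pickle

import numpy as np


class Tools:

    @staticmethod
    def create_folder(path):
        # Returns the path so callers can chain it, as Example #3 does.
        os.makedirs(path, exist_ok=True)
        return path

    @staticmethod
    def load_json(path):
        # Callers treat None as "unreadable", so errors are swallowed here.
        try:
            with open(path) as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            return None

    @staticmethod
    def save_json(path, data):
        with open(path, 'w') as f:
            json.dump(data, f, indent=4)

    @staticmethod
    def save_array(array, path):
        np.save(path, array)

    @staticmethod
    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def save_pickle(path, data):
        with open(path, 'wb') as f:
            pickle.dump(data, f)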
Example #2
    def run(self, src, distances_path):
        folder_path = os.path.dirname(distances_path)
        try:
            t.create_folder(folder_path)
            distances = self.compute_dist(src)
            if distances is None:
                return None
            t.save_array(distances, distances_path)
            return distances_path
        except Exception as e:
            # Report the failing folder but keep the batch running.
            print(folder_path, e)
            return None
Example #3
    def process(self, youtube_path, exports_path, workdir):
        yt_list = t.load_json(youtube_path)
        ex_list = t.load_json(exports_path)
        self.test_len = 0
        self.run_from = 41609
        self.timestamp = time.time()
        self.list_to_process = yt_list + ex_list

        self.err_file = os.path.join(workdir, 'errors.json')

        run_to = self.run_from + self.test_len if self.test_len != 0 else len(
            self.list_to_process)

        errors = []
        for idx in tqdm(range(self.run_from, run_to),
                        ascii=True,
                        desc='process {} links'.format(run_to)):
            record = self.list_to_process[idx]
            path, out_path_rel = record['src'], record['dst']
            out_path = t.create_folder(os.path.join(workdir, out_path_rel))

            out = os.path.join(out_path, 'image.png')
            out_info = os.path.join(out_path, 'info.json')
            if os.path.isfile(out):
                # Already processed; skip.
                continue

            try:
                frame, info = self.get_full_frame(path)
                info['src'] = path
                cv2.imwrite(out, frame)
                with open(out_info, 'w') as outfile:
                    json.dump(info, outfile, indent=4)
            except Exception as e:
                err = '{}: {}'.format(path, e)
                print(err)
                errors.append(err)

        if len(errors):
            with open(self.err_file, 'w') as outfile:
                json.dump(errors, outfile, indent=4)
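`get_full_frame` is not shown. Example #8 later inspects `info['rets']` as a list of per-frame success flags, which suggests it grabs several frames per video and composes them into the single `image.png`. A purely hypothetical sketch along those lines:

# Hypothetical: sample n frames evenly across the video, tile them into one
# image, and record each grab's success flag in info['rets'].
import cv2
import numpy as np


def get_full_frame(path, n_frames=4, size=(320, 180)):
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    rets, tiles = [], []
    for i in range(n_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(total * i / n_frames))
        ret, frame = cap.read()
        rets.append(bool(ret))
        if ret:
            tiles.append(cv2.resize(frame, size))
        else:
            # Keep the tile layout stable even when a grab fails.
            tiles.append(np.zeros((size[1], size[0], 3), dtype=np.uint8))
    cap.release()
    return np.hstack(tiles), {'rets': rets, 'frame_count': total}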
Example #4
    def run(self, src, vector_path):
        # Unlike the distances variant in Example #2, errors propagate here.
        folder_path = os.path.dirname(vector_path)
        t.create_folder(folder_path)
        video_vector = self.compute_vector(src)
        t.save_array(video_vector, vector_path)
        return vector_path
Example #5
    def read_durations(self, data):
        for idx in tqdm(range(len(data)),
                        ascii=True,
                        desc='ffprobe on {} files'.format(len(data))):
            record = data[idx]
            try:
                duration = t.get_duration(record[n.SRC])
                # get_duration returns seconds; store milliseconds.
                data[idx][n.DURATION] = duration * 1000
            except Exception as e:
                print(record, e)
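`t.get_duration` is also opaque, but the progress bar's "ffprobe on N files" label and the `* 1000` conversion above imply it shells out to ffprobe and returns seconds. A sketch under that assumption:

# Hypothetical: query ffprobe for the container duration in seconds.
import subprocess


def get_duration(path):
    out = subprocess.check_output([
        'ffprobe', '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        path,
    ])
    return float(out)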
Example #6
    def run(self, matches_path, out_matches_path):
        self.matches = t.load_pickle(matches_path)
        self.out_matches_path = out_matches_path
        self.file2mfcc = FileToMFCC(workdir)
        # self.file2video_vector = FileToVideoVector(workdir)

        # Flatten the match graph into (ida, idb) pairs.
        process_list = []
        for ida, match in self.matches.items():
            for idb, match_item in match.idb_items.items():
                process_list.append((ida, idb))
        random.shuffle(process_list)

        for idy, ide in tqdm(process_list,
                             ascii=True,
                             desc='process {} lines'.format(len(process_list))):
            y_info = t.load_json(os.path.join(workdir, idy, 'info.json'))
            e_info = t.load_json(os.path.join(workdir, ide, 'info.json'))
            if None in (y_info, e_info):
                print('ERR', idy, ide)
                continue
            y_a_path, y_v_path = self.get_vectors(idy, y_info)
            e_a_path, e_v_path = self.get_vectors(ide, e_info)
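`get_vectors` is not included either. Given the `a_filename`/`v_filename` constants and `OVERWRITE_*` flags in Example #13, it plausibly returns cached per-clip audio and video vector paths, computing them on demand through the `run(src, out_path)` helpers of Examples #2 and #4 (FileToVideoVector appears only commented out above, so the video half is speculative). A hypothetical standalone version:

# Hypothetical: resolve, and lazily compute, the cached vector paths for one clip.
import os


def get_vectors(workdir, clip_id, info, file2mfcc, file2video_vector,
                a_filename='distances.npy', v_filename='video_vector.npy',
                overwrite=False):
    a_path = os.path.join(workdir, clip_id, a_filename)
    v_path = os.path.join(workdir, clip_id, v_filename)
    if overwrite or not os.path.isfile(a_path):
        a_path = file2mfcc.run(info['src'], a_path)        # may return None
    if overwrite or not os.path.isfile(v_path):
        v_path = file2video_vector.run(info['src'], v_path)
    return a_path, v_path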
Example #7
    def run(self):
        self.yt_list = t.load_json(self.youtube_path)
        self.ex_list = t.load_json(self.exports_path)
        self.youtube_to_process, self.export_to_process = [], []
        print('==> prepare_in_out_list')
        self.prepare_in_out()

        self.youtube_to_process_path = os.path.join(self.workdir,
                                                    'youtube_to_process.json')
        self.export_to_process_path = os.path.join(self.workdir,
                                                   'export_to_process.json')

        # print('==> read youtube durations')
        # self.read_durations(self.youtube_to_process)
        # t.save_json(self.youtube_to_process_path, self.youtube_to_process)

        print('==> read export durations')
        self.read_durations(self.export_to_process)
        t.save_json(self.export_to_process_path, self.export_to_process)

        return self.youtube_to_process_path, self.export_to_process_path
Example #8
    def run(self):
        done = 0
        counter = 0
        for ida, match in tqdm(self.matches.items(), ascii=True,
                               desc='process {} lines'.format(len(self.matches))):
            counter += 1
            try:
                if len(self.matches[ida].mse):
                    continue  # already processed in a previous run
            except AttributeError:
                self.matches[ida].mse = []

            a_path = os.path.join(self.workdir, match.ida, 'image.png')
            a_info_path = os.path.join(self.workdir, match.ida, 'info.json')
            if not os.path.isfile(a_path) or not os.path.isfile(a_info_path):
                print('cannot read', a_path)
                continue
            A = cv2.imread(a_path)
            A_info = t.load_json(a_info_path)
            # Skip clips where more than two frame grabs failed.
            if len([ret for ret in A_info['rets'] if not ret]) > 2:
                continue

            if len(match.diff):
                for idb in match.idb:
                    b_path = os.path.join(self.workdir, idb, 'image.png')
                    b_info_path = os.path.join(self.workdir, idb, 'info.json')
                    if not os.path.isfile(b_path) or not os.path.isfile(b_info_path):
                        print('cannot read', b_path)
                        continue
                    B = cv2.imread(b_path)
                    B_info = t.load_json(b_info_path)
                    if len([ret for ret in B_info['rets'] if not ret]) > 2:
                        continue

                    # Cast before subtracting: uint8 arithmetic wraps around.
                    diff = A.astype('float32') - B.astype('float32')
                    mse = (diff ** 2).mean(axis=None)
                    match.mse.append((idb, mse))
                    done += 1

            # Checkpoint progress every 100 matches.
            if counter % 100 == 0:
                print('data_saved')
                t.save_pickle(self.path_out, self.matches)

        t.save_pickle(self.path_out, self.matches)
        print('data_saved')
        print('done {}/{}'.format(done, len(self.matches)))
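The cast before the subtraction is essential: `cv2.imread` returns uint8 arrays, and uint8 arithmetic wraps around instead of going negative, which would silently corrupt the MSE. A quick demonstration:

import numpy as np

a = np.array([10], dtype=np.uint8)
b = np.array([20], dtype=np.uint8)
print(a - b)                                        # [246] -- wrapped, not -10
print(a.astype(np.float32) - b.astype(np.float32))  # [-10.]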
Example #9
    def get_info(self, id):
        info_path = os.path.join(self.workdir, id, 'info.json')
        info = t.load_json(info_path)
        if info is None:
            print('cannot read', info_path)
        return info
Example #10
###################################################################################################################

from api.tools import Tools as t
from tqdm import tqdm
import os

y = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json'
e = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/export_to_process.json'
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'
all_items = t.load_json(e) + t.load_json(y)

# Count output folders that are still missing their info.json.
cnt = 0
for item in tqdm(all_items,
                 ascii=True,
                 desc='process {} lines'.format(len(all_items))):
    folderpath = os.path.join(workdir, item['dst'])
    if 'info.json' not in os.listdir(folderpath):
        cnt += 1

print('{}/{} folders missing info.json'.format(cnt, len(all_items)))
Example #11
from api.tools import Tools as t
import random

# Sample 100 random records from the YouTube processing list.
yt_to_process = t.load_json('/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')
yt_to_process_100 = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process_100.json'

random.shuffle(yt_to_process)
y100 = yt_to_process[:100]
print('new', len(y100))
t.save_json(yt_to_process_100, y100)
Example #12
    # C: absolute per-pixel difference, computed in int16 so the uint8
    # subtraction cannot wrap around.
    C = np.array(
        np.abs(np.array(A, dtype=np.int16) - np.array(B, dtype=np.int16)),
        dtype=np.uint8)

    res = (640, 360)
    A = cv2.resize(A, res)
    B = cv2.resize(B, res)
    C = cv2.resize(C, res)

    cv2.imshow('youtube', A)
    cv2.imshow('exports', B)
    cv2.imshow('diff', C)
    cv2.waitKey(0)
    time.sleep(0.2)


matches = t.load_pickle(path)
ok = 0
nomse = 0
multiple_mse = 0
deleted = 0
to_delete = []

dct_yt_list = {}

years = {}
for idx, (ida, match) in enumerate(matches.items()):
    ida_info_path = os.path.join(workdir, ida, 'info.json')
    if not os.path.isfile(ida_info_path):
Example #13
from api.tools import Tools as t
from file_to_mfcc import FileToMFCC
from file_to_video_vector import FileToVideoVector
from api.offset_calc import OffsetCalc
from tqdm import tqdm
import random
import shutil

MIN_CORR_SCORE = 0
a_filename = 'distances.npy'
v_filename = 'video_vector.npy'

OVERWRITE_A = False
OVERWRITE_V = False

y = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')
e = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/export_to_process.json')
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'

part_num = 5
print('part_num', part_num)
whole_list = y + e

# random.shuffle(whole_list)
step = len(whole_list) // 6
parts = []

for idx in range(6):
    from_to = [idx * step, (idx + 1) * step - 1]
    if idx == 5:
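The snippet cuts off inside the `idx == 5` branch; presumably the last part is widened to absorb the remainder left by the integer division. A self-contained sketch of that partitioning (the remainder handling is an assumption):

# Split n_items indices into num_parts contiguous [from, to] ranges; the
# last range absorbs the division remainder (assumed behaviour).
def make_parts(n_items, num_parts=6):
    step = n_items // num_parts
    parts = []
    for idx in range(num_parts):
        from_to = [idx * step, (idx + 1) * step - 1]
        if idx == num_parts - 1:
            from_to[1] = n_items - 1
        parts.append(from_to)
    return parts


print(make_parts(20))
# [[0, 2], [3, 5], [6, 8], [9, 11], [12, 14], [15, 19]]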
Example #14
workdir = '/mnt/data/palpatine/DATASETS/YT_LINK/workdir'

# Input lists; the commented REF pair is an alternative dataset.
exports_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/sample/sources_a.json'
yt_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/sample/sources_b.json'

# exports_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_exports.json'
# yt_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/REF/list_youtube.json'

test_len = 0
ss = 0
DIFF_LIMIT = 1000


workdir = t.create_folder(workdir)
t_start = time.time()
print('\nCREATE PROCESS POOL AND IMAGES')
vl = CreateProcessPool(youtube_path=yt_list_path,
                       exports_path=exports_list_path,
                       workdir=workdir,
                       test_len=test_len,
                       ss=ss)
youtube_to_process_path, export_to_process_path = vl.run()
# CreateImages().process(youtube_to_process_path, export_to_process_path, workdir)
print('\nMATCH BY DURATION')
m = MatchByDuration(youtube_path=youtube_to_process_path,
                    export_path=export_to_process_path,
                    workdir=workdir,
                    diff_limit=DIFF_LIMIT)
m.get_duration_lists()

print('done in {:.02f} s'.format(time.time() - t_start))
Example #15
    def __init__(self, workdir, matches_path, path_out):
        self.matches = t.load_pickle(matches_path)
        self.workdir = workdir
        self.path_out = path_out

###################################################################################################################

from api.tools import Tools as t

y_popular_id_list = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/100_popular_from_2017.json')
y_list = t.load_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/workdir/youtube_to_process.json')

all_youtube_dict_by_id = {}

for item in y_list:
    # The YouTube ID is the source filename stem.
    y_id = item['src'].split('/')[-1].split('.')[0]
    all_youtube_dict_by_id[y_id] = item

new_list_to_process = []
ok = 0
for y_id_pop in y_popular_id_list:
    if y_id_pop in all_youtube_dict_by_id:
        ok += 1
        new_list_to_process.append(all_youtube_dict_by_id[y_id_pop])

print('{}/{}'.format(ok, len(y_popular_id_list)))
t.save_json(
    '/mnt/data/palpatine/DATASETS/YT_LINK/100_popular_from_2017_to_process.json',
    new_list_to_process)
Example #17
from api.tools import Tools as t
import csv

path = '/mnt/data/palpatine/SAMPLES/YT_LINK/Vimeo migration - DONE - data-3_2019-Veronika.csv'

yt_path = '/mnt/data/palpatine/DATASETS/YT_LINK/REF/sources_youtube.json'
ex_path = '/mnt/data/palpatine/DATASETS/YT_LINK/REF/sources_exports.json'
youtube_entire_video_list_path = '/mnt/data/palpatine/SAMPLES/YT_LINK/youtube_listdir_video.json'
youtube_entire_video_list = t.load_json(youtube_entire_video_list_path)
file = open(path, 'r')
reader = csv.reader(file)
youtube_list = []
exports_list = []
for line in reader:
    export, yt_link = line[3], line[2]
    if len(export.split('/')) == 1:
        continue
    if not len(export) or not len(yt_link):
        continue
    # Vimeo links are out of scope here.
    if 'vimeo' in yt_link.lower():
        continue
    yt_id = yt_link.rsplit('/')[-1]

    # Linear scan for a local file whose name contains the YouTube ID.
    yt_video_path = None
    for video in youtube_entire_video_list:
        if yt_id in video:
            yt_video_path = video
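The inner loop rescans the whole `youtube_entire_video_list` for every CSV row, making this quadratic. Since Example #15 derives the YouTube ID from the filename stem, a one-time index keyed the same way would make each lookup O(1), assuming the ID really is the stem:

import os

# One-time index: filename stem -> full path, replacing the inner scan.
video_by_id = {
    os.path.splitext(os.path.basename(v))[0]: v
    for v in youtube_entire_video_list
}

yt_video_path = video_by_id.get(yt_id)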