# Parse the command-line arguments (the argparse setup is defined earlier in the script).
args = parser.parse_args()

# Load the model; the multi-label variant uses its own architecture.
if args.multi:
    args.arch = 'multi_resnet3d50'
model = models.load_model(args.arch)

# Get the dataset categories.
if args.multi:
    categories = models.load_categories('category_multi_momentsv2.txt')
else:
    categories = models.load_categories('category_momentsv2.txt')

# Load the video frame transform.
transform = models.load_transform()

# Obtain video frames, either from a folder of pre-extracted JPEGs or via ffmpeg.
if args.frame_folder is not None:
    print('Loading frames in {}'.format(args.frame_folder))
    import glob
    # Make sure the frame paths are in the correct temporal order after sorting.
    frame_paths = sorted(glob.glob(os.path.join(args.frame_folder, '*.jpg')))
    frames = load_frames(frame_paths)
else:
    print('Extracting frames using ffmpeg...')
    frames = extract_frames(args.video_file, args.num_segments)

# Prepare the input tensor.
if 'resnet3d50' in args.arch:
    # The 3D model takes a single clip: [1, 3, num_frames, 224, 224].
    input = torch.stack([transform(frame) for frame in frames], 1).unsqueeze(0)
else:
    # The 2D model scores frames independently: [num_frames, 3, 224, 224].
    input = torch.stack([transform(frame) for frame in frames])
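# ---------------------------------------------------------------------------
# Hedged sketch: `load_frames` and `extract_frames` are called above but are
# defined elsewhere in the original script. A minimal implementation, assuming
# PIL-based loading and an `ffmpeg` binary on PATH (the 'frames' output
# directory and the evenly-spaced subsampling are assumptions), could be:
import glob
import os
import subprocess

from PIL import Image


def load_frames(frame_paths, num_frames=8):
    # Load the frames as RGB PIL images and subsample them evenly in time.
    frames = [Image.open(path).convert('RGB') for path in frame_paths]
    if len(frames) < num_frames:
        raise ValueError('Video must have at least {} frames'.format(num_frames))
    step = len(frames) // num_frames
    return frames[::step][:num_frames]


def extract_frames(video_file, num_frames=8):
    # Dump JPEG frames with ffmpeg, then load and subsample them.
    os.makedirs('frames', exist_ok=True)
    subprocess.check_call(
        ['ffmpeg', '-i', video_file, '-loglevel', 'error',
         '-vf', 'scale=-1:256', 'frames/%06d.jpg'])
    return load_frames(sorted(glob.glob('frames/*.jpg')), num_frames)
# ---------------------------------------------------------------------------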
import os
import glob
from random import randint

import numpy as np
import pandas as pd
import requests
import torch
import torch.nn.functional as F
from moviepy.editor import (AudioFileClip, CompositeVideoClip, TextClip,
                            VideoFileClip)
from pytube import YouTube

import models


def load_video(video_hash, start, end, arch='resnet3d50',
               frame_folder=None, num_segments=16):
    # `start`, `end`, `arch`, `frame_folder`, and `num_segments` were free
    # variables in the original; they are taken as parameters here so the
    # function is self-contained.
    # Download the requested YouTube clip to /tmp.
    yt = YouTube('https://youtube.com/embed/%s?start=%d&end=%d'
                 % (video_hash, start, end))
    video = yt.streams.first()  # `.all()[0]` in the original; deprecated in newer pytube
    name = video.download('/tmp')

    # Load the model and the valence/arousal lookup tables.
    model = models.load_model(arch)
    av_categories = pd.read_csv('CVS_Actions(NEW).csv', delimiter=';').values.tolist()
    trax = pd.read_csv('audioTracks_urls.csv')

    # Get dataset categories.
    # categories = models.load_categories()  # unused: replaced by the CSV tables above

    # Load the video frame transform.
    transform = models.load_transform()

    # Obtain video frames, either from a folder of pre-extracted JPEGs or via ffmpeg.
    if frame_folder is not None:
        print('Loading frames in {}'.format(frame_folder))
        # Make sure the frame paths are in the correct temporal order after sorting.
        frame_paths = sorted(glob.glob(os.path.join(frame_folder, '*.jpg')))
        print(frame_paths)
        frames = load_frames(frame_paths)
    else:
        print('Extracting frames using ffmpeg...')
        frames = extract_frames(name, num_segments)

    # Prepare the input tensor.
    if arch == 'resnet3d50':
        # The 3D model takes a single clip: [1, 3, num_frames, 224, 224].
        input = torch.stack([transform(frame) for frame in frames], 1).unsqueeze(0)
    else:
        # The 2D model scores frames independently: [num_frames, 3, 224, 224].
        input = torch.stack([transform(frame) for frame in frames])

    # Make the video prediction.
    with torch.no_grad():
        logits = model(input)
        h_x = F.softmax(logits, 1).mean(dim=0)
        probs, idx = h_x.sort(0, True)

    # Map the top predicted category to valence/arousal coordinates.
    print('RESULT ON ' + name)
    top = idx[0].item()
    y = float(av_categories[top][1]) * 125  # energy/arousal
    x = float(av_categories[top][2]) * 125  # valence

    # Rank the tracks by Euclidean distance in valence/energy space and
    # pick one at random from the ten closest.
    trax = trax.assign(
        dist=lambda df: np.sqrt((x - df.valence)**2 + (y - df.energy)**2))
    print('min', trax['dist'].min())
    best = trax.nsmallest(100, 'dist')
    print(best)
    rand = randint(0, 9)
    print(rand)
    choice = best.iloc[rand, [1, 2, 5]]
    print('choice', choice)
    song = ('valence: ' + str(x) + ' arousal: ' + str(y) + ' '
            + choice.iloc[0] + ' ' + choice.iloc[1])
    print(song)
    print(x, y)
    for i in range(0, 5):
        print('{:.3f} -> {} -> {}'.format(probs[i], idx[i], av_categories[idx[i]]))
        print('result categories', av_categories[idx[i]][0], av_categories[idx[i]][1])

    # Download the 30-second audio preview for the chosen track.
    r = requests.get(choice.iloc[2], allow_redirects=True)
    os.makedirs('./tmp', exist_ok=True)
    with open('./tmp/preview.mp3', 'wb') as f:
        f.write(r.content)

    # Render the output clip with the matched audio and the prediction text.
    rendered_output = './tmp/' + video_hash + '_' + str(x) + '_' + str(y) + '.mp4'
    clip = VideoFileClip(name).subclip(30, 60)
    audioclip = AudioFileClip('./tmp/preview.mp3')
    txt_clip = TextClip(song, fontsize=16, color='white')
    clip_final = clip.set_audio(audioclip)
    video = CompositeVideoClip([clip_final, txt_clip])
    video.set_duration(30).write_videofile(rendered_output)
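# Example invocation; the video hash and clip window are hypothetical, and the
# two CSV files referenced above must be present in the working directory:
if __name__ == '__main__':
    load_video('dQw4w9WgXcQ', start=30, end=60)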