def beat_extractor(queue_beat):
    kwargs = dict(
        fps=100,
        correct=True,
        infile=None,
        outfile=None,
        max_bpm=170,
        min_bpm=60,
        # nn_files=[BEATS_LSTM[0]],
        transition_lambda=100,
        num_frames=1,
        online=True,
        verbose=1)

    def beat_callback(beats, output=None):
        if len(beats) > 0:
            # Do something with the beat (for now, just push it onto the queue)
            queue_beat.put(beats[0])
            # print(beats)
            # print('Process to write beats: %s' % os.getpid())

    in_processor = RNNBeatProcessor(**kwargs)
    beat_processor = DBNBeatTrackingProcessor(**kwargs)
    out_processor = [beat_processor, beat_callback]
    processor = IOProcessor(in_processor, out_processor)
    process_online(processor, **kwargs)
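A hedged usage sketch for beat_extractor: run it in a separate process and read detected beat times from the queue. The multiprocessing wiring below is illustrative and not part of the original snippet; it assumes beat_extractor and its madmom imports (RNNBeatProcessor, DBNBeatTrackingProcessor, IOProcessor, process_online) are available in the same module, and that infile=None makes process_online read from the default audio input.

# Hypothetical wiring around beat_extractor(); not from the original project.
import multiprocessing as mp

if __name__ == '__main__':
    queue_beat = mp.Queue()
    p = mp.Process(target=beat_extractor, args=(queue_beat,))
    p.daemon = True
    p.start()
    while True:
        beat = queue_beat.get()   # blocks until the next beat is reported
        print('beat at %.2f s' % beat)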
def get_beat_processor():
    print('START BEAT PROCESSOR >> ', str(datetime.now()))
    from madmom.features.beats import RNNBeatProcessor, DBNBeatTrackingProcessor
    from madmom.processors import SequentialProcessor
    print('BEAT PROCESSOR >> ', str(datetime.now()))
    return SequentialProcessor(
        [RNNBeatProcessor(), DBNBeatTrackingProcessor(fps=100)])
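A minimal usage sketch, assuming 'song.wav' is a placeholder audio path: the returned SequentialProcessor can be called directly on a file, running the RNN activation stage and the DBN decoding in sequence and yielding beat times in seconds.

# Hypothetical usage of get_beat_processor(); 'song.wav' is a placeholder path.
from datetime import datetime   # needed by the timing prints above

proc = get_beat_processor()
beat_times = proc('song.wav')   # 1-D numpy array of beat positions in seconds
print('Detected %d beats' % len(beat_times))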
def beatSyncFeature(feature, audio, sr, hop_length):
    # Aggregate feature between beat events
    fps = sr / hop_length  # frame rate of the feature matrix
    beat_proc = DBNBeatTrackingProcessor(fps=100)
    beat_act = RNNBeatProcessor()(audio)
    beat_times = beat_proc(beat_act)
    # We'll use the median value of each feature between beat frames
    # (librosa.util.sync in newer librosa versions)
    feature = librosa.feature.sync(feature,
                                   (beat_times * fps).astype(int),
                                   aggregate=np.median)
    return feature, beat_times
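A hedged usage sketch for beatSyncFeature: beat-synchronize a chroma matrix. The path 'song.wav' and the settings sr=44100, hop_length=512 are assumptions; the audio is wrapped in a madmom Signal so the RNN knows its sample rate.

# Hypothetical usage; path and analysis parameters are placeholders.
import librosa
import numpy as np
from madmom.audio.signal import Signal

y, sr = librosa.load('song.wav', sr=44100)
hop_length = 512
chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
signal = Signal(y, sample_rate=sr)   # attach the sample rate for madmom
chroma_sync, beat_times = beatSyncFeature(chroma, signal, sr, hop_length)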
def get_beats(file_path: str) -> List[float]:
    """
    Given the path to an audio file get a list of detected beat timings (in seconds)
    """
    print(f"Getting beats for {file_path}")
    proc = DBNBeatTrackingProcessor(fps=100)
    act = RNNBeatProcessor()(file_path)
    res: List[float] = proc(act)
    print(f"Got {len(res)} beats")
    print(res)
    return res
def getRNNDBNOnsets(filename):
    """
    Call Madmom's implementation of RNN + DBN beat tracking
    :param filename: Path to audio file
    """
    print("Computing madmom beats...")
    from madmom.features.beats import RNNBeatProcessor, DBNBeatTrackingProcessor
    proc = DBNBeatTrackingProcessor(fps=100)
    act = RNNBeatProcessor()(filename)
    b = proc(act)
    return b
def extract(yt_id):
    beats = SequentialProcessor(
        [RNNBeatProcessor(), DBNBeatTrackingProcessor(fps=100)])
    chordrec = SequentialProcessor(
        [CNNChordFeatureProcessor(), CRFChordRecognitionProcessor()])
    processMulti = ParallelProcessor([])
    processMulti.append(beats)
    processMulti.append(chordrec)
    beatSync = SequentialProcessor(
        [printTime, processMulti, printTime, arrange, printTime])
    return beatSync('tmp/' + yt_id + '.wav')
def madmom_features(self, fps=100):
    """
    Call Madmom's implementation of RNN + DBN beat tracking. Madmom's results
    are returned in terms of seconds, but are rounded and converted to be in
    terms of hop_size so that they line up with the features. The novelty
    function is also computed as a side effect (and is the bottleneck in the
    computation), so it is returned as well.

    Parameters
    ----------
    fps: int
        Frames per second in processing

    Returns
    -------
    {
        'tempos': ndarray(n_levels, 2)
            An array of tempo estimates in beats per minute,
            along with their confidences
        'onsets': ndarray(n_onsets)
            Array of onsets, where each onset indexes into a particular window
        'novfn': ndarray(n_frames)
            Evaluation of the RNN audio novelty function at each audio frame,
            in time increments equal to self.hop_length
        'snovfn': ndarray(n_frames)
            Superflux audio novelty function at each audio frame,
            in time increments equal to self.hop_length
    }
    """
    from madmom.features.beats import RNNBeatProcessor, DBNBeatTrackingProcessor
    from madmom.features.tempo import TempoEstimationProcessor
    from madmom.features.onsets import SpectralOnsetProcessor
    from madmom.audio.filters import LogarithmicFilterbank
    beatproc = DBNBeatTrackingProcessor(fps=fps)
    tempoproc = TempoEstimationProcessor(fps=fps)
    novfn = RNNBeatProcessor()(self.audio_file)  # this step is the computational bottleneck
    beats = beatproc(novfn)
    tempos = tempoproc(novfn)
    onsets = np.array(np.round(beats * self.fs / float(self.hop_length)),
                      dtype=np.int64)
    # Resample the audio novelty function to correspond to the correct hop length
    nframes = len(self.librosa_noveltyfn())
    novfn = np.interp(np.arange(nframes) * self.hop_length / float(self.fs),
                      np.arange(len(novfn)) / float(fps), novfn)
    # For good measure, also compute and return superflux
    sodf = SpectralOnsetProcessor(onset_method='superflux', fps=fps,
                                  filterbank=LogarithmicFilterbank,
                                  num_bands=24, log=np.log10)
    snovfn = sodf(self.audio_file)
    snovfn = np.interp(np.arange(nframes) * self.hop_length / float(self.fs),
                       np.arange(len(snovfn)) / float(fps), snovfn)
    return {'tempos': tempos, 'onsets': onsets, 'novfn': novfn, 'snovfn': snovfn}
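A hedged usage sketch for the method above; `feats` is a hypothetical instance of the surrounding class with self.audio_file, self.fs and self.hop_length already set (the class itself is not shown in this snippet).

# Hypothetical usage; `feats` is an assumed instance of the enclosing class.
res = feats.madmom_features(fps=100)
bpm, strength = res['tempos'][0]   # tempo hypotheses, strongest first
print('Most likely tempo: %.1f BPM (relative strength %.2f)' % (bpm, strength))
print('First beat falls in feature window', res['onsets'][0])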
def getRNNDBNOnsets(filename, Fs, hopSize):
    """
    Call Madmom's implementation of RNN + DBN beat tracking.
    :param filename: Path to audio file
    :param Fs: Sample rate
    :param hopSize: Hop size of each onset function value
    :returns (tempo, beats): Average tempo in beats per minute, and a numpy
        array of beat positions expressed in hop-size frames
    """
    print("Computing madmom beats...")
    from madmom.features.beats import RNNBeatProcessor, DBNBeatTrackingProcessor
    proc = DBNBeatTrackingProcessor(fps=100)
    act = RNNBeatProcessor()(filename)
    b = proc(act)
    # Average tempo from the mean inter-beat interval (in seconds)
    tempo = 60 / np.mean(b[1:] - b[:-1])
    # Convert beat times (seconds) to feature frame indices
    beats = np.array(np.round(b * Fs / hopSize), dtype=np.int64)
    return (tempo, beats)
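A hedged usage sketch for this variant; the path 'song.wav' and the values Fs=44100, hopSize=512 are placeholder assumptions matching typical feature settings.

# Hypothetical usage; path and analysis parameters are placeholders.
tempo, beat_frames = getRNNDBNOnsets('song.wav', Fs=44100, hopSize=512)
print('Estimated tempo: %.1f BPM' % tempo)
print('First beat at feature frame', beat_frames[0])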
def chordBeats(infile, outfile):
    print('Loading audio file...', infile)
    # proc = BeatTrackingProcessor(
    #     fps=100,
    #     method='comb', min_bpm=40,
    #     max_bpm=240, act_smooth=0.09,
    #     hist_smooth=7, alpha=0.79)
    proc = DBNBeatTrackingProcessor(fps=100, method='comb', min_bpm=40,
                                    max_bpm=240)
    act = RNNBeatProcessor()(infile)
    beats = proc(act).astype('float32')
    audio = essentia.standard.MonoLoader(filename=infile)()
    # TODO: best parameters.
    parameters = {}
    stepsize, semitones = vamp.collect(audio, 44100, "nnls-chroma:nnls-chroma",
                                       output="semitonespectrum",
                                       step_size=2048)["matrix"]
    np.savez(outfile, [len(audio)], beats, semitones)
def __init__(self):
    self.pa = pyaudio.PyAudio()
    self.c_count = 0
    using_callback = True
    self.buffer = collections.deque(maxlen=self.RATE * 14)
    self.rnn = RNNBeatProcessor(online=True, nn_files=[BEATS_LSTM[0]])
    self.act_proc = DBNBeatTrackingProcessor(fps=100, min_bpm=80.0,
                                             max_bpm=180.0)
    self.dcp = DeepChromaProcessor()
    self.decode = DeepChromaChordRecognitionProcessor()
    self.start_current_time = None
    if using_callback:
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=True,
                                   frames_per_buffer=self.CHUNK,
                                   stream_callback=self.callback)
        print(self.pa.get_default_output_device_info())
        print(self.pa.get_default_input_device_info())
        self.t_start = time.time()
        beepsnd, _ = librosa.load('block.wav', sr=None)
        out1 = (beepsnd).tostring()
        # print(beepsnd.size, len(out1))
        self.beepsnd = out1
        self.Flag = False
        self.beep_count = 0
        while self.stream.is_active():
            if len(self.buffer) == self.RATE * 8:
                print('14 sec')
                print(self.time_info)
                print(time.time() - self.t_start)
                self.tmp = np.array(self.buffer)
                self.buffer.clear()
                print(time.time() - self.t_start)
                chroma_thread = threading.Thread(target=self.chroma_rec, args=())
                chroma_thread.start()
                # chord = chroma_thread.run()
                tmp2 = self.rnn(self.tmp)
                # tmp2 = librosa.onset.onset_strength(tmp, sr=self.RATE,
                #     hop_length=int(self.RATE / 100), max_size=1,
                #     aggregate=np.median, n_mels=256)
                # tmp2 /= np.max(tmp2)
                # t_axes = librosa.frames_to_time(np.arange(len(tmp2)), sr=self.RATE)
                t_proc = time.time() - self.t_start
                print(t_proc)
                tmp3_2 = self.act_proc(tmp2)
                tmp3_1 = 60 / np.mean(np.diff(tmp3_2))
                # print(tmp3)
                # tmp3_1, tmp3_2 = librosa.beat.beat_track(onset_envelope=tmp2, sr=self.RATE)
                print('tempo is %f' % tmp3_1)
                print('beat is ', tmp3_2)
                t_proc = time.time() - self.t_start
                chroma_thread.join()
                print(t_proc)
                t = threading.Timer(60. / tmp3_1 - t_proc, self.flagit, ())
                t.daemon = True
                t.start()
                # self.stream.write(self.beepsnd)
                print(time.time() - self.t_start)
            else:
                time.sleep(0.001)
    else:
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=True,
                                   frames_per_buffer=self.CHUNK)
        self.t_start = time.time()
        self.loop()
def activation2downbeat(activation, fps=100):
    # Track downbeats by constraining the DBN to a very low "tempo" range
    return DBNBeatTrackingProcessor(min_bpm=16.0, max_bpm=55.0, fps=fps)(activation)
def __init__(self):
    self.pa = pyaudio.PyAudio()
    self.c_count = 0
    using_callback = True
    self.buffer = collections.deque(maxlen=self.RATE * 14)
    self.rnn = RNNBeatProcessor(online=True, nn_files=[BEATS_LSTM[0]])
    self.act_proc = DBNBeatTrackingProcessor(fps=100, min_bpm=80.0,
                                             max_bpm=180.0)
    self.dcp = DeepChromaProcessor()
    self.decode = DeepChromaChordRecognitionProcessor()
    self.start_current_time = None
    self.beep_count = 0
    source_path = 'tool'
    style_name = 'test_midi_folder'
    self.test = InstScheduler(FoxDot.lib.Clock, source_path)
    self.test.AddMidiFolder(style_name)
    # Online random playing event determined by prosperity function
    self.test.Live_event()
    # If the meta file exists, calling this routine is not required
    self.test.set_tempo_pattern(4, 4)
    if using_callback:
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=True,
                                   frames_per_buffer=self.CHUNK,
                                   stream_callback=self.callback)
        print(self.pa.get_default_output_device_info())
        print(self.pa.get_default_input_device_info())
        self.t_start = time.time()
        beepsnd, _ = librosa.load('block.wav', sr=None)
        out1 = (beepsnd).tostring()
        # print(beepsnd.size, len(out1))
        self.beepsnd = out1
        self.Flag = False
        while self.stream.is_active():
            if len(self.buffer) == self.RATE * 14:
                print('14 sec')
                print(self.time_info)
                print(time.time() - self.t_start)
                self.tmp = np.array(self.buffer)
                self.buffer.clear()
                print(time.time() - self.t_start)
                chroma_thread = threading.Thread(target=self.chroma_rec, args=())
                chroma_thread.start()
                # chord = chroma_thread.run()
                tmp2 = self.rnn(self.tmp)
                # tmp2 = librosa.onset.onset_strength(tmp, sr=self.RATE,
                #     hop_length=int(self.RATE / 100), max_size=1,
                #     aggregate=np.median, n_mels=256)
                # tmp2 /= np.max(tmp2)
                # t_axes = librosa.frames_to_time(np.arange(len(tmp2)), sr=self.RATE)
                t_proc = time.time() - self.t_start
                print(t_proc)
                tmp3_2 = self.act_proc(tmp2)
                tmp3_1 = 60 / np.mean(np.diff(tmp3_2))
                # print(tmp3)
                # tmp3_1, tmp3_2 = librosa.beat.beat_track(onset_envelope=tmp2, sr=self.RATE)
                print('tempo is %f' % tmp3_1)
                print('beat is ', tmp3_2)
                t_proc = time.time() - self.t_start
                chroma_thread.join()
                print(t_proc)
                t = threading.Timer(60. / tmp3_1 - t_proc, self.flagit, ())
                t.daemon = True
                t.start()
                print(int(tmp3_1))
                self.test.StartInTime(
                    np.mean(np.diff(tmp3_2)) * 4 - (14 - tmp3_2[-1]) - t_proc,
                    int(tmp3_1))
                break
                # self.stream.write(self.beepsnd)
                print(time.time() - self.t_start)
            else:
                time.sleep(0.001)
        while (1):
            time.sleep(0.01)
    else:
        self.stream = self.pa.open(format=self.FORMAT,
                                   channels=self.CHANNELS,
                                   rate=self.RATE,
                                   input=True,
                                   output=True,
                                   frames_per_buffer=self.CHUNK)
        self.t_start = time.time()
        self.loop()
def main():
    video_dir = 'dance_videos\\Danny Ocean - Baby I Wont.mp4'
    beat_dir = video_dir.strip('mp4') + 'npy'
    interval = [32, 36]  # in seconds
    REDU = True
    motion_base_dir = 'MyNao\\motion_base\\motion_base.json'
    if not os.path.exists(motion_base_dir):
        motion_base = {}
        with open(motion_base_dir, 'w') as f:
            json.dump(motion_base, f)
    with open(motion_base_dir, 'r') as f:
        motion_base = json.load(f)
    if REDU:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base) - 1)
    else:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base))
    if not os.path.exists(pose_save_dir):
        os.mkdir(pose_save_dir)
    motion = {}
    motion['feature'] = {}
    motion['feature']['bps'] = [None]
    motion['feature']['symmetric'] = False
    motion['feature']['repeat'] = True
    motion['frame'] = {}
    # args = parse_args()
    # cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True
    time_0 = time.time()
    tester = Tester(24)
    ## loading 3D pose estimation model
    tester._make_model()
    time_1 = time.time()
    print('loading integral pose model elapse:', round(time_1 - time_0, 2), 's')
    ## loading yolo detector
    detector = YOLOv3(
        model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
        class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
        weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
        classes=('person', ),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
    print('loading yolo elapse:', round(time.time() - time_1, 2), 's')
    skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                (4, 5), (5, 6))
    fig = plt.figure(figsize=(10, 10))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    ## load model
    if not os.path.exists(video_dir.strip('mp4') + 'wav'):
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        audioclip.write_audiofile(video_dir.strip('mp4') + 'wav')
    video = cv2.VideoCapture(video_dir)
    if not os.path.exists(beat_dir):
        time_2 = time.time()
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        beat_activation = RNNBeatProcessor()(video_dir.strip('mp4') + 'wav')
        processor = DBNBeatTrackingProcessor(fps=100)
        beats = processor(beat_activation)
        frames_at_beat = (beats / audioclip.duration *
                          video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
        print('extracting beat sequence elapse:', round(time.time() - time_2, 2), 's')
        np.save(beat_dir, frames_at_beat)
    frames_at_beat = np.load(beat_dir).tolist()
    for beat in frames_at_beat:
        if interval[0] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[0] = beat
            break
    for beat in frames_at_beat:
        if interval[1] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[1] = beat
            break
    video.set(1, interval[0])
    frame = 0
    next_beat = 0
    last_beat = 0
    num_beat = 0
    num_frame_between_beats = []
    with torch.no_grad():
        while True:
            time_start = time.time()
            current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
            ret_val, raw_image = video.read()
            if current_frame == interval[1]:
                break
            input_img = raw_image.copy()
            ## using yolo to get human bounding box
            detections = detector.predict_single(input_img)
            # if not detections.cpu().numpy().all():
            #     detections = (0, 0, input_img.shape[1], input_img.shape[0], 1, 1)
            #     print('not detected')
            if detections is None:
                detections = np.array(
                    [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            elif detections.size()[0] == 0:
                detections = np.array(
                    [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            last_conf = 0
            last_last_conf = 0
            for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf,
                    cls_pred) in enumerate(detections):
                if conf.item() > last_conf:
                    x1 = int(round(x1_pred.item())) - 40
                    x2 = int(round(x2_pred.item())) + 40
                    y1 = int(round(y1_pred.item())) - 20
                    y2 = int(round(y2_pred.item())) + 20  # enlarge the bounding box to cover the full body, in order to get a more accurate pose
                    last_last_conf = last_conf
                    last_conf = conf.item()
            print(last_conf, last_last_conf)
            if last_last_conf != 0:
                sys.exit()
            # print(x1, x2, y1, y2, last_conf)
            img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
            input_patch = cv2.resize(img_patch, (cfg.input_shape))
            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)
            print('Running model time:', round(time.time() - time_start, 2), 's')
            motion['frame'][frame] = {}
            if frame + interval[0] in frames_at_beat:
                motion['frame'][frame]['next_beat'] = 0
                motion['frame'][frame]['last_beat'] = 0
                # frames_at_beat.remove(frame)
                next_beat = frames_at_beat.index(frame + interval[0]) + 1
                last_beat = frames_at_beat.index(frame + interval[0])
                num_beat += 1
                num_frame_between_beats.append(frames_at_beat[next_beat] -
                                               frames_at_beat[last_beat])
                print('Record key frame with beat:', current_frame)
            else:
                motion['frame'][frame]['next_beat'] = frames_at_beat[next_beat] - (frame + interval[0])
                motion['frame'][frame]['last_beat'] = (frame + interval[0]) - frames_at_beat[last_beat]
            coord_out = coord_out.cpu().numpy()
            coord_out_resize = coord_out * np.array([
                img_patch.shape[1] / cfg.input_shape[1],
                img_patch.shape[0] / cfg.input_shape[0], 1
            ])
            for idx in range(coord_out_resize.shape[1] - 1):
                motion['frame'][frame][idx] = (coord_out_resize[0][idx][0].item(),
                                               coord_out_resize[0][idx][2].item(),
                                               coord_out_resize[0][idx][1].item())
            vis = True
            vis_3d = False
            if vis:
                tmpimg = input_patch[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + \
                         np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = (tmpimg).astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, 18))
                tmpkps[:2, :] = coord_out[0, :, :2].transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                cv2.imwrite(file_name, tmpimg)
            if vis_3d:
                # coord_out = coord_out.cpu().numpy()
                # coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                pred = coord_out_resize.squeeze()  # remove first batch dimension
                ax = plt.subplot('121', projection='3d')
                plt.axis('off')
                show3D_pose(pred, ax, skeleton, radius=40)
                file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                plt.savefig(file_name)
                # cv2.imwrite(file_name, tmpimg)
            frame += 1
            print('Processing Frame:', round(time.time() - time_start, 2), 's')
    motion['feature']['fpb'] = np.mean(num_frame_between_beats)
    if REDU:
        motion_base[len(motion_base) - 1] = motion
    else:
        motion_base[len(motion_base)] = motion
    # with open(motion_base_dir, 'w') as f:
    #     json.dump(motion_base, f)
    print('done with', num_beat + 1, 'beats! (This should be even for a normal dance)')
    print('num_frame between beats:')
    print(num_frame_between_beats)
import sys
import bmaFunctions
import numpy
from madmom.features.chords import DeepChromaChordRecognitionProcessor
from madmom.audio.chroma import DeepChromaProcessor
from madmom.features.beats import DBNBeatTrackingProcessor
from madmom.features.beats import RNNBeatProcessor

# Setting up the deep chroma chord recognition processor
dcp = DeepChromaProcessor()
decode = DeepChromaChordRecognitionProcessor()
chroma = dcp(sys.argv[1])
chords = decode(chroma)

# Setting up the dynamic Bayesian network beat tracking processor
proc = DBNBeatTrackingProcessor(fps=100)
act = RNNBeatProcessor()(sys.argv[1])
beats = proc(act)

# Calculating the msi (mean inter-beat spacing, in milliseconds)
beatsArray = numpy.array(beats)
msi = numpy.mean(beatsArray[1:] - beatsArray[:-1]) * 1000

beatmap = bmaFunctions.assignKeys(beats, chords, sys.argv[3])
if msi < 360:
    del beatmap[1::2]

# Generating and printing the beatmap
bmaFunctions.fancyPrint(beatmap, msi, sys.argv[2])
# TODO: eliminate trailing Ns
def main():
    video_list = ['Cant stop the feeling - Justin Timberlake - Easy Dance for Kids',
                  'Dance like yo daddy',
                  'Danny Ocean - Baby I Wont',
                  'Si una vez - If I Once',
                  'Vaiven - MegaMix']
    for video in video_list:
        video_dir = 'dance_videos\\' + video + '.mp4'
        beat_dir = video_dir.strip('mp4') + 'npy'
        cudnn.fastest = True
        cudnn.benchmark = True
        cudnn.deterministic = False
        cudnn.enabled = True
        time_0 = time.time()
        tester = Tester(24)
        ## loading 3D pose estimation model
        tester._make_model()
        time_1 = time.time()
        print('loading integral pose model elapse:', round(time_1 - time_0, 2), 's')
        ## loading yolo detector
        detector = YOLOv3(
            model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
            class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
            weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
            classes=('person',),
            max_batch_size=16,
            device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
        print('loading yolo elapse:', round(time.time() - time_1, 2), 's')
        skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                    (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                    (4, 5), (5, 6))
        # fig = plt.figure(figsize=(10, 10))
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
        ])
        if not os.path.exists(video_dir.strip('mp4') + 'wav'):
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            audioclip.write_audiofile(video_dir.strip('mp4') + 'wav')
        video = cv2.VideoCapture(video_dir)
        if not os.path.exists(beat_dir):
            time_2 = time.time()
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            beat_activation = RNNBeatProcessor()(video_dir.strip('mp4') + 'wav')
            processor = DBNBeatTrackingProcessor(fps=100)
            beats = processor(beat_activation)
            frames_at_beat = (beats / audioclip.duration *
                              video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
            print('extracting beat sequence elapse:', round(time.time() - time_2, 2), 's')
            np.save(beat_dir, frames_at_beat)
        frames_at_beat = np.load(beat_dir).tolist()
        ##########################################
        dance_primitives_dir = '.\\danceprimitives_trial'
        if not os.path.exists(dance_primitives_dir):
            os.mkdir(dance_primitives_dir)
        motion_index = len(os.listdir(dance_primitives_dir))
        for i in range(len(frames_at_beat) - 1):
            motion_dir = os.path.join(dance_primitives_dir,
                                      '{0}'.format(str(motion_index).zfill(5)))
            if not os.path.exists(motion_dir):
                os.mkdir(motion_dir)
            start = frames_at_beat[i]
            end = frames_at_beat[i + 1]
            dance_primitive = np.empty((0, 17 * 3))  # for motion control
            # dance_primitive_norm = np.empty((0, 17 * 3))  # for motion clustering
            video.set(1, start)
            jump_flag = 0
            frame = 0
            with torch.no_grad():
                time_start = time.time()
                while True:
                    current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
                    ret_val, raw_image = video.read()
                    if current_frame == end:
                        break
                    ## using yolo to get human bounding box
                    input_img = raw_image.copy()
                    detections = detector.predict_single(input_img)
                    if detections is None or detections.size()[0] == 0:
                        jump_flag = 1
                        break
                    last_conf = 0
                    for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf,
                            cls_pred) in enumerate(detections):
                        if conf.item() > last_conf:
                            x1 = max(int(round(x1_pred.item())) - 40, 0)
                            x2 = min(int(round(x2_pred.item())) + 40, input_img.shape[1] - 1)
                            y1 = max(int(round(y1_pred.item())) - 20, 0)
                            y2 = min(int(round(y2_pred.item())) + 20, input_img.shape[0] - 1)  # enlarge the bounding box to cover the full body, in order to get a more accurate pose
                            last_conf = conf.item()
                    img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
                    ## using ResPoseNet to get 3D human pose
                    input_patch = cv2.resize(img_patch, (cfg.input_shape))
                    input_patch = transform(input_patch).unsqueeze(0)
                    coord_out = tester.model(input_patch).cpu().numpy()  # dimension: 1 x 18 x 3, where '3' refers to x, z, y in sequence
                    # show_pose(input_patch, img_patch, coord_out, skeleton, motion_dir, frame)
                    # transform to original scale
                    coord_out_resize = coord_out * np.array([
                        img_patch.shape[1] / cfg.input_shape[1],
                        img_patch.shape[0] / cfg.input_shape[0], 1
                    ])
                    coord_out = coord_out_resize[:, :-1, :]  # neglect the key point for "thorax"
                    # coord_out_norm = (coord_out - np.mean(coord_out, axis=1)) / np.std(coord_out, axis=1)
                    dance_primitive = np.vstack((dance_primitive,
                                                 np.reshape(coord_out[0], -1)))
                    # dance_primitive_norm = np.vstack((dance_primitive_norm,
                    #                                   np.reshape(coord_out_norm[0], -1)))
                    frame += 1
                print('Processing Time Elapse:', round(time.time() - time_start, 2), 's')
            if jump_flag == 1:
                continue
            # norm_sample = np.empty((0, 17 * 3))
            # num_sample = 10
            # print(dance_primitive_norm.shape[0])
            # sample_step = (dance_primitive_norm.shape[0] - 1) / (num_sample - 1)
            # for i in range(num_sample):
            #     norm_sample = np.vstack((norm_sample, dance_primitive_norm[round(i * sample_step)]))
            # print(norm_sample.shape)
            print(dance_primitive.shape)
            # np.save(os.path.join(motion_dir, 'dance_motion_normlized_' + str(motion_index)), norm_sample)
            np.save(os.path.join(motion_dir, 'dance_motion_' + str(motion_index)),
                    dance_primitive)
            motion_index += 1
        ###########################################
        sys.exit()
        # Note: everything below is unreachable (sys.exit() above) and appears to
        # be leftover from the single-video version of this script; it references
        # interval, motion, pose_save_dir, REDU and motion_base, which are not
        # defined in this function.
        video.set(1, interval[0])
        frame = 0
        next_beat = 0
        last_beat = 0
        num_beat = 0
        num_frame_between_beats = []
        with torch.no_grad():
            while True:
                time_start = time.time()
                current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
                ret_val, raw_image = video.read()
                if current_frame == interval[1]:
                    break
                input_img = raw_image.copy()
                ## using yolo to get human bounding box
                detections = detector.predict_single(input_img)
                # if not detections.cpu().numpy().all():
                #     detections = (0, 0, input_img.shape[1], input_img.shape[0], 1, 1)
                #     print('not detected')
                if detections is None:
                    detections = np.array(
                        [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                elif detections.size()[0] == 0:
                    detections = np.array(
                        [[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                last_conf = 0
                last_last_conf = 0
                for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf,
                        cls_pred) in enumerate(detections):
                    if conf.item() > last_conf:
                        x1 = int(round(x1_pred.item())) - 40
                        x2 = int(round(x2_pred.item())) + 40
                        y1 = int(round(y1_pred.item())) - 20
                        y2 = int(round(y2_pred.item())) + 20  # enlarge the bounding box to cover the full body, in order to get a more accurate pose
                        last_last_conf = last_conf
                        last_conf = conf.item()
                print(last_conf, last_last_conf)
                if last_last_conf != 0:
                    sys.exit()
                # print(x1, x2, y1, y2, last_conf)
                img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
                input_patch = cv2.resize(img_patch, (cfg.input_shape))
                input_patch = transform(input_patch).unsqueeze(0)
                coord_out = tester.model(input_patch)
                print('Running model time:', round(time.time() - time_start, 2), 's')
                motion['frame'][frame] = {}
                if frame + interval[0] in frames_at_beat:
                    motion['frame'][frame]['next_beat'] = 0
                    motion['frame'][frame]['last_beat'] = 0
                    # frames_at_beat.remove(frame)
                    next_beat = frames_at_beat.index(frame + interval[0]) + 1
                    last_beat = frames_at_beat.index(frame + interval[0])
                    num_beat += 1
                    num_frame_between_beats.append(frames_at_beat[next_beat] -
                                                   frames_at_beat[last_beat])
                    print('Record key frame with beat:', current_frame)
                else:
                    motion['frame'][frame]['next_beat'] = frames_at_beat[next_beat] - (frame + interval[0])
                    motion['frame'][frame]['last_beat'] = (frame + interval[0]) - frames_at_beat[last_beat]
                coord_out = coord_out.cpu().numpy()
                coord_out_resize = coord_out * np.array([
                    img_patch.shape[1] / cfg.input_shape[1],
                    img_patch.shape[0] / cfg.input_shape[0], 1
                ])
                for idx in range(coord_out_resize.shape[1] - 1):
                    motion['frame'][frame][idx] = (coord_out_resize[0][idx][0].item(),
                                                   coord_out_resize[0][idx][2].item(),
                                                   coord_out_resize[0][idx][1].item())
                vis = True
                vis_3d = False
                if vis:
                    tmpimg = input_patch[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + \
                             np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = (tmpimg).astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, 18))
                    tmpkps[:2, :] = coord_out[0, :, :2].transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                    tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                    file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                    cv2.imwrite(file_name, tmpimg)
                if vis_3d:
                    # coord_out = coord_out.cpu().numpy()
                    # coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                    pred = coord_out_resize.squeeze()  # remove first batch dimension
                    ax = plt.subplot('121', projection='3d')
                    plt.axis('off')
                    show3D_pose(pred, ax, skeleton, radius=40)
                    file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                    plt.savefig(file_name)
                    # cv2.imwrite(file_name, tmpimg)
                frame += 1
                print('Processing Frame:', round(time.time() - time_start, 2), 's')
        motion['feature']['fpb'] = np.mean(num_frame_between_beats)
        if REDU:
            motion_base[len(motion_base) - 1] = motion
        else:
            motion_base[len(motion_base)] = motion
        # with open(motion_base_dir, 'w') as f:
        #     json.dump(motion_base, f)
        print('done with', num_beat + 1, 'beats! (This should be even for a normal dance)')
        print('num_frame between beats:')
        print(num_frame_between_beats)
def main(): """DBNBeatTracker""" # define parser p = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=''' The DBNBeatTracker.py program detects all beats in an audio file according to the method described in: "A Multi-Model Approach to Beat Tracking Considering Heterogeneous Music Styles" Sebastian Böck, Florian Krebs and Gerhard Widmer. Proceedings of the 15th International Society for Music Information Retrieval Conference (ISMIR), 2014. It does not use the multi-model (Section 2.2.) and selection stage (Section 2.3), i.e. this version corresponds to the pure DBN version of the algorithm for which results are given in Table 2. Instead of the originally proposed state space and transition model for the DBN, the following is used: "An Efficient State Space Model for Joint Tempo and Meter Tracking" Florian Krebs, Sebastian Böck and Gerhard Widmer. Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR), 2015. This program can be run in 'single' file mode to process a single audio file and write the detected beats to STDOUT or the given output file. $ DBNBeatTracker.py single INFILE [-o OUTFILE] If multiple audio files should be processed, the program can also be run in 'batch' mode to save the detected beats to files with the given suffix. $ DBNBeatTracker.py batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] FILES If no output directory is given, the program writes the files with the detected beats to the same location as the audio files. The 'pickle' mode can be used to store the used parameters to be able to exactly reproduce experiments. ''') # version p.add_argument('--version', action='version', version='DBNBeatTracker.py.2016') # input/output options io_arguments(p, output_suffix='.beats.txt', online=True) ActivationsProcessor.add_arguments(p) # signal processing arguments SignalProcessor.add_arguments(p, norm=False, gain=0) # peak picking arguments DBNBeatTrackingProcessor.add_arguments(p) NeuralNetworkEnsemble.add_arguments(p, nn_files=None) # parse arguments args = p.parse_args() # set immutable arguments args.fps = 100 # print arguments if args.verbose: print(args) # input processor if args.load: # load the activations from file in_processor = ActivationsProcessor(mode='r', **vars(args)) else: # use a RNN to predict the beats in_processor = RNNBeatProcessor(**vars(args)) # output processor if args.save: # save the RNN beat activations to file out_processor = ActivationsProcessor(mode='w', **vars(args)) else: # track the beats with a DBN beat_processor = DBNBeatTrackingProcessor(**vars(args)) # output handler from madmom.utils import write_events as writer # sequentially process everything out_processor = [beat_processor, writer] # create an IOProcessor processor = IOProcessor(in_processor, out_processor) # and call the processing function args.func(processor, **vars(args))
def activation2beat(activation, fps=100):
    return DBNBeatTrackingProcessor(fps=fps)(activation)
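A hedged usage sketch for activation2beat and activation2downbeat (both defined above), assuming 'song.wav' is a placeholder path: the RNN activation is computed once and then decoded twice with different tempo ranges.

# Hypothetical usage; 'song.wav' is a placeholder path.
from madmom.features.beats import RNNBeatProcessor

activation = RNNBeatProcessor()('song.wav')            # one forward pass of the RNN
beats = activation2beat(activation, fps=100)           # beat times in seconds
downbeats = activation2downbeat(activation, fps=100)   # slower periodicity (16-55 "BPM")
print(len(beats), 'beats,', len(downbeats), 'downbeat candidates')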