def __init__(self, cfg: ConfigTree):
    """Set up data loaders, model, losses and a warm-up-scheduled optimizer.

    Args:
        cfg: parsed configuration tree for this experiment.
    """
    self.cfg = cfg
    print(cfg)

    # TensorBoard writer; `experiment_path` is a module-level name.
    self.summary_writer = SummaryWriter(log_dir=experiment_path)

    self.model_builder = ModelFactory(cfg)
    self.dataset_builder = DataLoaderFactory(cfg)
    self.train_ds = self.dataset_builder.build(split='train')
    self.test_ds = self.dataset_builder.build(split='val')
    self.ds: YoutubeDataset = self.train_ds.dataset

    # Padding positions must not contribute to either loss.
    self.train_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)
    self.val_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)

    self.model: nn.Module = self.model_builder.build(
        device=torch.device('cuda'), wrapper=nn.DataParallel)

    # lr starts at 0.0 on purpose: CustomSchedule owns the learning rate.
    base_optimizer = optim.Adam(self.model.parameters(),
                                lr=0.,
                                betas=(0.9, 0.98),
                                eps=1e-9)
    self.optimizer = CustomSchedule(
        self.cfg.get_int('model.emb_dim'),
        optimizer=base_optimizer,
    )

    self.num_epochs = cfg.get_int('num_epochs')
    logger.info(f'Use control: {self.ds.use_control}')
def __init__(self, cfg: ConfigTree, args):
    """Build loaders, model and optimizer; optionally resume from a checkpoint.

    Args:
        cfg: parsed configuration tree for this experiment.
        args: CLI namespace; only ``args.load`` is read here.
    """
    self.cfg = cfg
    self.device = self.cfg.get_string('device')

    # TensorBoard writer; `experiment_path` is a module-level name.
    self.summary_writer = SummaryWriter(log_dir=experiment_path)

    self.model_builder = ModelFactory(cfg)
    self.dataset_builder = DataLoaderFactory(cfg)
    self.train_ds = self.dataset_builder.build(split='train')
    self.test_ds = self.dataset_builder.build(split='val')
    self.ds: YoutubeDataset = self.train_ds.dataset

    # Padding positions must not contribute to either loss.
    self.train_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)
    self.val_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)

    self.model: nn.Module = self.model_builder.build(
        device=torch.device(self.device), wrapper=nn.DataParallel)

    # Plain Adam with a fixed lr (the CustomSchedule wrapper is not used here).
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=3e-4,
                                betas=(0.9, 0.98),
                                eps=1e-9,
                                weight_decay=1e-5)

    self.num_epochs = cfg.get_int('num_epochs')

    if args.load:
        # Resume: restore weights/optimizer and shrink the remaining epoch
        # budget by the number of epochs already completed.
        print("loading model...")
        checkpoint = torch.load(experiment_path + '/checkpoint.pth.tar')
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        self.num_epochs -= epoch
        # Despite the name, this holds the number of epochs already done;
        # it is used as an offset when saving checkpoints.
        self.epochs_left = epoch
    else:
        self.epochs_left = 0

    logger.info(f'Use control: {self.ds.use_control}')
    self.duration = self.cfg.get_float('dataset.duration')
def main(args):
    """Generate MIDI/audio from IMU features for every test-set sample.

    For each sample the model predicts an event sequence; both the prediction
    and the ground-truth MIDI are written out, plus a rendered WAV.

    Args:
        args: CLI namespace with ``checkpoint``, ``output``, ``control`` and
            ``instrument`` attributes.
    """
    torch.set_grad_enabled(False)  # pure inference
    checkpoint_path = Path(args.checkpoint)
    output_dir = Path(args.output)

    if args.control is not None:
        control_tensor = utils.midi.pitch_histogram_string_to_control_tensor(
            args.control)
    else:
        control_tensor = None

    cp = torch.load(checkpoint_path)
    # The experiment config is expected to sit next to the checkpoint.
    cfg = ConfigFactory.parse_file(checkpoint_path.parent / 'config.conf')
    instrument = cfg.get_string('dataset.instrument', args.instrument)
    pprint(cfg)
    print('Using Instrument:', instrument)

    model_factory = ModelFactory(cfg)
    dataloader_factory = DataLoaderFactory(cfg)
    device = cfg.get_string('device')
    rnn = cfg.get_bool('model.rnn')

    model: nn.Module = model_factory.build(device=torch.device(device),
                                           wrapper=nn.DataParallel)
    model.load_state_dict(cp['model_state_dict'])
    model.eval()

    dl = dataloader_factory.build(split='test')
    ds: YoutubeDataset = dl.dataset
    pprint(ds.samples[:5])
    length = cfg.get_float('dataset.duration')  # duration of produced audio (s)

    # One directory for generated audio, one for generated video.
    os.makedirs(output_dir / 'audio', exist_ok=True)
    os.makedirs(output_dir / 'video', exist_ok=True)
    print(len(ds), "samples")

    for data in tqdm(ds):
        index = data['index']
        imu = data['imu']
        midi_x = data['midi_x']  # ground-truth event tensor

        if device == 'cuda':
            imu = imu.cuda(non_blocking=True)
            if control_tensor is not None:
                control_tensor = control_tensor.cuda(non_blocking=True)

        sample = ds.samples[index]

        if rnn:
            # RNN path: predict event ids, then re-score through the output head.
            imu = model.module.forward_imu_net(imu.unsqueeze(0))
            events = torch.from_numpy(
                model.module.transformer.predict(imu.squeeze(1), ds.num_events))
            events = model.module.softmax(
                model.module.get_output(events.unsqueeze(1)))
            events = torch.max(events, dim=2)[1]
        else:
            # Transformer path: beam-search generation.
            events = model.module.generate(
                imu.unsqueeze(0),
                target_seq_length=ds.num_events,
                beam=5,
                pad_idx=ds.PAD_IDX,
                sos_idx=ds.SOS_IDX,
                eos_idx=ds.EOS_IDX,
                control=control_tensor,
            )

        if events.shape[1] <= 0:
            # Nothing was generated for this sample; report and move on.
            print('=' * 100)
            print('not events')
            print(sample)
            print('=' * 100)
            continue

        print('this events shape: ', events.shape)
        print('this events length: ', len(events))

        # File names carry the clip's start/end time for traceability.
        ss = change_time_format(sample.start_time)
        dd = change_time_format(sample.start_time + length)
        add_name = '-' + ss + '-' + dd
        folder_name = "samples"
        midi_filename = sample.vid
        audio_filename = sample.vid

        # 'midi' holds predictions, 'midi2' holds the ground truth.
        midi_dir = output_dir / 'midi'
        os.makedirs(midi_dir, exist_ok=True)
        midi_dir2 = output_dir / 'midi2'
        os.makedirs(midi_dir2, exist_ok=True)

        midi_path = midi_dir / f'{midi_filename}{add_name}.midi'
        pm = utils.midi.tensor_to_pm(events.squeeze(), instrument=instrument)
        pm.write(str(midi_path))

        midi_path2 = midi_dir2 / f'{midi_filename}{add_name}.midi'
        pm2 = utils.midi.tensor_to_pm(midi_x, instrument=instrument)
        pm2.write(str(midi_path2))

        audio_dir = output_dir / 'audio' / f'{folder_name}'
        os.makedirs(audio_dir, exist_ok=True)
        audio_path = audio_dir / f'{audio_filename}{add_name}.wav'
        utils.midi.pm_to_wav(
            pm,
            audio_path,
            rate=22050,
        )
def main(args):
    """Generate MIDI/audio from body-pose features and mux audio onto video.

    Args:
        args: CLI namespace with ``checkpoint``, ``video``, ``output``,
            ``control``, ``instrument`` and ``only_audio`` attributes.
    """
    torch.set_grad_enabled(False)  # pure inference
    checkpoint_path = Path(args.checkpoint)
    video_dir = Path(args.video)
    output_dir = Path(args.output)

    if args.control is not None:
        control_tensor = utils.midi.pitch_histogram_string_to_control_tensor(
            args.control)
    else:
        control_tensor = None

    cp = torch.load(checkpoint_path)
    # The experiment config is expected to sit next to the checkpoint.
    cfg = ConfigFactory.parse_file(checkpoint_path.parent / 'config.conf')
    instrument = cfg.get_string('dataset.instrument', args.instrument)
    pprint(cfg)
    print('Using Instrument:', instrument)

    model_factory = ModelFactory(cfg)
    dataloader_factory = DataLoaderFactory(cfg)
    model: MusicTransformer = model_factory.build(device=DEVICE)
    model.load_state_dict(cp['state_dict'])
    model.eval()

    dl = dataloader_factory.build(split='val')
    ds: YoutubeDataset = dl.dataset
    pprint(ds.samples[:5])
    length = cfg.get_float('dataset.duration')  # duration of produced audio (s)

    # One directory for generated audio, one for generated video.
    os.makedirs(output_dir / 'audio', exist_ok=True)
    os.makedirs(output_dir / 'video', exist_ok=True)

    for data in tqdm(ds):
        index = data['index']
        pose = data['pose'].cuda(non_blocking=True)
        if control_tensor is not None:
            control_tensor = control_tensor.cuda(non_blocking=True)
        sample = ds.samples[index]

        events = model.generate(
            pose.unsqueeze(0),
            target_seq_length=ds.num_events,
            beam=5,
            pad_idx=ds.PAD_IDX,
            sos_idx=ds.SOS_IDX,
            eos_idx=ds.EOS_IDX,
            control=control_tensor,
        )

        if events.shape[1] <= 0:
            # Nothing was generated for this sample; report and move on.
            print('=' * 100)
            print('not events')
            print(sample)
            print('=' * 100)
            continue

        print('this events shape: ', events.shape)
        print('this events length: ', len(events))

        # Existence check for the source video; narrow to StopIteration,
        # which is exactly what an exhausted glob raises.
        try:
            next(video_dir.glob(f'{sample.vid}.*'))
        except StopIteration as e:
            print(e)
            print('skip')
            # Without a video we can still render audio when requested.
            if not args.only_audio:
                continue

        # File names carry the clip's start/end time for traceability.
        ss = change_time_format(sample.start_time)
        dd = change_time_format(sample.start_time + length)
        add_name = '-' + ss + '-' + dd

        midi_dir = output_dir / 'midi' / f'{sample.vid}'
        os.makedirs(midi_dir, exist_ok=True)
        midi_path = midi_dir / f'{sample.vid}{add_name}.midi'
        pm = utils.midi.tensor_to_pm(events.squeeze(), instrument=instrument)
        pm.write(str(midi_path))

        audio_dir = output_dir / 'audio' / f'{sample.vid}'
        os.makedirs(audio_dir, exist_ok=True)
        audio_path = audio_dir / f'{sample.vid}{add_name}.wav'
        utils.midi.pm_to_wav(
            pm,
            audio_path,
            rate=22050,
        )

        if not args.only_audio:
            # Find the source video listed in val.csv.
            in_path = get_video_path(video_dir, sample.vid)
            vid_name = sample.vid
            vid_dir = os.path.join(output_dir, 'video', vid_name)
            os.makedirs(vid_dir, exist_ok=True)  # race-free vs exists()+mkdir()

            vid_dir_name = sample.vid  # just the name, no suffix like .mp4
            # Mux the generated audio onto the cut video.
            vid_path = os.path.join(vid_dir,
                                    str(vid_dir_name) + add_name + '.mp4')
            # Paths are quoted so names with spaces survive the shell.
            # NOTE(review): prefer subprocess.run([...], shell=False) for
            # untrusted file names.
            cmd2 = (f'ffmpeg -y -ss {ss} -i "{in_path}" -t {length} '
                    f'-i "{str(audio_path)}" -t {length} '
                    f'-map 0:v:0 -map 1:a:0 -c:v libx264 -c:a aac '
                    f'-strict experimental "{vid_path}"')
            os.system(cmd2)
class Engine(BaseEngine):
    """Training/validation engine for the pose/IMU-to-MIDI model.

    Owns the data loaders, model, criteria, optimizer and TensorBoard writer,
    and drives the epoch loop with per-epoch checkpointing.
    """

    def __init__(self, cfg: ConfigTree, args):
        """Build loaders, model and optimizer; optionally resume a checkpoint.

        Args:
            cfg: parsed configuration tree for this experiment.
            args: CLI namespace; only ``args.load`` is read here.
        """
        self.cfg = cfg
        self.device = self.cfg.get_string('device')
        # TensorBoard writer; `experiment_path` is a module-level name.
        self.summary_writer = SummaryWriter(log_dir=experiment_path)
        self.model_builder = ModelFactory(cfg)
        self.dataset_builder = DataLoaderFactory(cfg)
        self.train_ds = self.dataset_builder.build(split='train')
        self.test_ds = self.dataset_builder.build(split='val')
        self.ds: YoutubeDataset = self.train_ds.dataset
        # Padding positions must not contribute to either loss.
        self.train_criterion = nn.CrossEntropyLoss(
            ignore_index=self.ds.PAD_IDX)
        self.val_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)
        self.model: nn.Module = self.model_builder.build(
            device=torch.device(self.device), wrapper=nn.DataParallel)
        # Plain Adam with a fixed lr (the CustomSchedule wrapper is unused).
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=3e-4,
                                    betas=(0.9, 0.98),
                                    eps=1e-9,
                                    weight_decay=1e-5)
        self.num_epochs = cfg.get_int('num_epochs')
        if args.load:
            # Resume: restore weights/optimizer state and shrink the epoch
            # budget by the number of epochs already completed.
            print("loading model...")
            checkpoint = torch.load(experiment_path + '/checkpoint.pth.tar')
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            epoch = checkpoint['epoch']
            self.num_epochs -= epoch
            # Despite the name, this holds the number of epochs already done;
            # it is used as an offset when saving checkpoints.
            self.epochs_left = epoch
        else:
            self.epochs_left = 0
        logger.info(f'Use control: {self.ds.use_control}')
        self.duration = self.cfg.get_float('dataset.duration')

    def _prepare_batch(self, data):
        """Select the configured input feature and move tensors to the device.

        Returns:
            (feat, midi_x, midi_y, control); ``control`` is None when the
            dataset provides no control signal.
        """
        midi_x, midi_y = data['midi_x'], data['midi_y']
        if self.ds.use_pose:
            feat = data['pose']
        elif self.ds.use_rgb:
            feat = data['rgb']
        elif self.ds.use_flow:
            feat = data['flow']
        elif self.ds.use_imu:
            feat = data['imu']
        else:
            raise Exception('No feature!')
        if self.device == 'cuda':
            feat = feat.cuda(non_blocking=True)
            midi_x = midi_x.cuda(non_blocking=True)
            midi_y = midi_y.cuda(non_blocking=True)
        if self.ds.use_control:
            # NOTE(review): moved to CUDA unconditionally (as in the original
            # code) — this would fail on a CPU-only configuration.
            control = data['control'].cuda(non_blocking=True)
        else:
            control = None
        return feat, midi_x, midi_y, control

    def train(self, epoch=0):
        """Run one training epoch; returns the epoch's average loss."""
        loss_meter = AverageMeter('Loss')
        acc_meter = AverageMeter('Acc')
        num_iters = len(self.train_ds)
        self.model.train()
        # Report the number of trainable parameters.
        model_parameters = filter(lambda p: p.requires_grad,
                                  self.model.parameters())
        count = sum(np.prod(p.size()) for p in model_parameters)
        print(count)
        for i, data in enumerate(self.train_ds):
            feat, midi_x, midi_y, control = self._prepare_batch(data)
            output = self.model(feat,
                                midi_x,
                                pad_idx=self.ds.PAD_IDX,
                                control=control)
            # CrossEntropy: output [B, T, D] -> [BT, D]; target [B, T] -> [BT].
            loss = self.train_criterion(output.view(-1, output.shape[-1]),
                                        midi_y.flatten())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            acc = compute_epiano_accuracy(output,
                                          midi_y,
                                          pad_idx=self.ds.PAD_IDX)
            batch_size = len(midi_x)
            loss_meter.update(loss.item(), batch_size)
            acc_meter.update(acc.item(), batch_size)
            logger.info(
                f'Train [{epoch}]/{self.num_epochs}][{i}/{num_iters}]\t'
                f'{loss_meter}\t{acc_meter}')
        self.summary_writer.add_scalar('train/loss', loss_meter.avg, epoch)
        self.summary_writer.add_scalar('train/acc', acc_meter.avg, epoch)
        return loss_meter.avg

    def test(self, epoch=0):
        """Run one validation epoch; returns the epoch's average loss."""
        loss_meter = AverageMeter('Loss')
        acc_meter = AverageMeter('Acc')
        num_iters = len(self.test_ds)
        self.model.eval()
        with torch.no_grad():
            for i, data in enumerate(self.test_ds):
                feat, midi_x, midi_y, control = self._prepare_batch(data)
                output = self.model(feat,
                                    midi_x,
                                    pad_idx=self.ds.PAD_IDX,
                                    control=control)
                # CrossEntropy: output [B, T, D] -> [BT, D];
                # target [B, T] -> [BT].
                loss = self.val_criterion(output.view(-1, output.shape[-1]),
                                          midi_y.flatten())
                # NOTE(review): unlike train(), pad_idx is not passed here, so
                # validation accuracy counts padding positions — confirm
                # whether that is intended.
                acc = compute_epiano_accuracy(output, midi_y)
                batch_size = len(midi_x)
                loss_meter.update(loss.item(), batch_size)
                acc_meter.update(acc.item(), batch_size)
                logger.info(
                    f'Val [{epoch}]/{self.num_epochs}][{i}/{num_iters}]\t'
                    f'{loss_meter}\t{acc_meter}')
        self.summary_writer.add_scalar('val/loss', loss_meter.avg, epoch)
        self.summary_writer.add_scalar('val/acc', acc_meter.avg, epoch)
        return loss_meter.avg

    @staticmethod
    def epoch_time(start_time: float, end_time: float):
        """Split an elapsed wall-clock interval into (minutes, seconds)."""
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def run(self):
        """Train/validate for ``num_epochs`` epochs, checkpointing each one."""
        best_loss = float('inf')
        for epoch in range(self.num_epochs):
            start_time = time.time()
            _train_loss = self.train(epoch)
            loss = self.test(epoch)
            end_time = time.time()
            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)
            logger.info(
                f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            # NOTE(review): best_loss is tracked but no separate best-model
            # checkpoint is written (unlike the save_checkpoint variant).
            best_loss = min(loss, best_loss)
            torch.save(
                {
                    # Offset by the epochs completed before a resume so the
                    # stored epoch index is absolute.
                    'epoch': epoch + self.epochs_left,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': loss,
                }, experiment_path + '/' + 'checkpoint.pth.tar')

    def close(self):
        """Flush and close the TensorBoard writer."""
        self.summary_writer.close()
class Engine(BaseEngine):
    """Training/validation engine (CustomSchedule variant, CUDA-only).

    Owns the data loaders, model, criteria, scheduled optimizer and the
    TensorBoard writer, and drives the epoch loop with best-model
    checkpointing via ``save_checkpoint``.
    """

    def __init__(self, cfg: ConfigTree):
        """Set up loaders, model, losses and a warm-up-scheduled optimizer."""
        self.cfg = cfg
        print(cfg)
        # TensorBoard writer; `experiment_path` is a module-level name.
        self.summary_writer = SummaryWriter(log_dir=experiment_path)
        self.model_builder = ModelFactory(cfg)
        self.dataset_builder = DataLoaderFactory(cfg)
        self.train_ds = self.dataset_builder.build(split='train')
        self.test_ds = self.dataset_builder.build(split='val')
        self.ds: YoutubeDataset = self.train_ds.dataset
        # Padding positions must not contribute to either loss.
        self.train_criterion = nn.CrossEntropyLoss(
            ignore_index=self.ds.PAD_IDX)
        self.val_criterion = nn.CrossEntropyLoss(ignore_index=self.ds.PAD_IDX)
        self.model: nn.Module = self.model_builder.build(
            device=torch.device('cuda'), wrapper=nn.DataParallel)
        # lr starts at 0.0 on purpose: CustomSchedule owns the learning rate.
        base_optimizer = optim.Adam(self.model.parameters(),
                                    lr=0.,
                                    betas=(0.9, 0.98),
                                    eps=1e-9)
        self.optimizer = CustomSchedule(
            self.cfg.get_int('model.emb_dim'),
            optimizer=base_optimizer,
        )
        self.num_epochs = cfg.get_int('num_epochs')
        logger.info(f'Use control: {self.ds.use_control}')

    def train(self, epoch=0):
        """Run one training epoch; returns the epoch's average loss."""
        loss_meter = AverageMeter('Loss')
        acc_meter = AverageMeter('Acc')
        num_iters = len(self.train_ds)
        self.model.train()
        for step, batch in enumerate(self.train_ds):
            midi_x, midi_y = batch['midi_x'], batch['midi_y']
            # Pick whichever visual feature the dataset is configured to emit.
            if self.ds.use_pose:
                feat = batch['pose']
            elif self.ds.use_rgb:
                feat = batch['rgb']
            elif self.ds.use_flow:
                feat = batch['flow']
            else:
                raise Exception('No feature!')
            feat = feat.cuda(non_blocking=True)
            midi_x = midi_x.cuda(non_blocking=True)
            midi_y = midi_y.cuda(non_blocking=True)
            if self.ds.use_control:
                control = batch['control'].cuda(non_blocking=True)
            else:
                control = None
            output = self.model(feat,
                                midi_x,
                                pad_idx=self.ds.PAD_IDX,
                                control=control)
            # CrossEntropy: output [B, T, D] -> [BT, D]; target [B, T] -> [BT].
            loss = self.train_criterion(output.view(-1, output.shape[-1]),
                                        midi_y.flatten())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            acc = compute_epiano_accuracy(output,
                                          midi_y,
                                          pad_idx=self.ds.PAD_IDX)
            batch_size = len(midi_x)
            loss_meter.update(loss.item(), batch_size)
            acc_meter.update(acc.item(), batch_size)
            logger.info(
                f'Train [{epoch}]/{self.num_epochs}][{step}/{num_iters}]\t'
                f'{loss_meter}\t{acc_meter}')
        self.summary_writer.add_scalar('train/loss', loss_meter.avg, epoch)
        self.summary_writer.add_scalar('train/acc', acc_meter.avg, epoch)
        return loss_meter.avg

    def test(self, epoch=0):
        """Run one validation epoch; returns the epoch's average loss."""
        loss_meter = AverageMeter('Loss')
        acc_meter = AverageMeter('Acc')
        num_iters = len(self.test_ds)
        self.model.eval()
        with torch.no_grad():
            for step, batch in enumerate(self.test_ds):
                midi_x, midi_y = batch['midi_x'], batch['midi_y']
                # Pick whichever visual feature the dataset is configured
                # to emit.
                if self.ds.use_pose:
                    feat = batch['pose']
                elif self.ds.use_rgb:
                    feat = batch['rgb']
                elif self.ds.use_flow:
                    feat = batch['flow']
                else:
                    raise Exception('No feature!')
                feat = feat.cuda(non_blocking=True)
                midi_x = midi_x.cuda(non_blocking=True)
                midi_y = midi_y.cuda(non_blocking=True)
                if self.ds.use_control:
                    control = batch['control'].cuda(non_blocking=True)
                else:
                    control = None
                output = self.model(feat,
                                    midi_x,
                                    pad_idx=self.ds.PAD_IDX,
                                    control=control)
                # CrossEntropy: output [B, T, D] -> [BT, D];
                # target [B, T] -> [BT].
                loss = self.val_criterion(output.view(-1, output.shape[-1]),
                                          midi_y.flatten())
                acc = compute_epiano_accuracy(output, midi_y)
                batch_size = len(midi_x)
                loss_meter.update(loss.item(), batch_size)
                acc_meter.update(acc.item(), batch_size)
                logger.info(
                    f'Val [{epoch}]/{self.num_epochs}][{step}/{num_iters}]\t'
                    f'{loss_meter}\t{acc_meter}')
        self.summary_writer.add_scalar('val/loss', loss_meter.avg, epoch)
        self.summary_writer.add_scalar('val/acc', acc_meter.avg, epoch)
        return loss_meter.avg

    @staticmethod
    def epoch_time(start_time: float, end_time: float):
        """Split an elapsed wall-clock interval into (minutes, seconds)."""
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def run(self):
        """Train/validate for ``num_epochs`` epochs, saving the best model."""
        best_loss = float('inf')
        for epoch in range(self.num_epochs):
            start_time = time.time()
            _train_loss = self.train(epoch)
            loss = self.test(epoch)
            end_time = time.time()
            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)
            logger.info(
                f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            save_checkpoint(
                {
                    # DataParallel wraps the model, so unwrap via .module.
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict()
                },
                is_best=is_best,
                folder=experiment_path)

    def close(self):
        """Flush and close the TensorBoard writer."""
        self.summary_writer.close()