list_datasets = [train_synth_data]
training_data = train_synth_data
scaler = Scaler()
scaler.calculate_scaler(training_data)
LOG.debug(scaler.mean_)

transforms_valid = get_transforms(cfg.max_frames, scaler=scaler)
# The validation dataset is only used to get an idea of what the results
# could be on the evaluation dataset.
validation_dataset = DataLoadDf(validation_df, dataset.get_feature_file,
                                many_hot_encoder.encode_strong_df,
                                transform=transforms_valid)

transforms = get_transforms(cfg.max_frames, scaler)
train_synth_data.set_transform(transforms)
if not no_weak:
    train_weak_data.set_transform(transforms)
    concat_dataset = ConcatDataset([train_weak_data, train_synth_data])
    # Take as many weak samples as synthetic (strong) samples in each batch.
    sampler = MultiStreamBatchSampler(concat_dataset,
                                      batch_sizes=[cfg.batch_size // 2,
                                                   cfg.batch_size // 2])
    training_data = DataLoader(concat_dataset, batch_sampler=sampler)
    valid_weak_data = DataLoadDf(valid_weak_df, dataset.get_feature_file,
                                 many_hot_encoder.encode_strong_df,
                                 transform=transforms_valid)
    weak_mask = slice(cfg.batch_size // 2)
    strong_mask = slice(cfg.batch_size // 2, cfg.batch_size)
else:
    # No weak data: train on the synthetic (strong) set alone and apply the
    # strong loss to the whole batch (assumed branch body).
    training_data = DataLoader(train_synth_data, batch_size=cfg.batch_size,
                               shuffle=True, drop_last=True)
    weak_mask = None
    strong_mask = slice(cfg.batch_size)
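# The two slice masks above let a single forward pass serve both label types:
# the first half of each batch comes from the weak stream and carries
# clip-level targets, the second half from the synthetic stream with
# frame-level targets. A minimal sketch of how such masks are typically
# consumed in a training loop; `pred_strong`, `pred_weak`, `target`, and
# `target_weak` are hypothetical names, not code from this file.
def example_masked_losses(pred_strong, pred_weak, target, target_weak,
                          weak_mask, strong_mask):
    import torch
    bce = torch.nn.BCELoss()
    # Weak loss on the weak half of the batch, strong loss on the synthetic half.
    loss_weak = bce(pred_weak[weak_mask], target_weak[weak_mask])
    loss_strong = bce(pred_strong[strong_mask], target[strong_mask])
    return loss_weak + loss_strong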
scaler = Scaler()
if path.exists(cfg.scaler_fn):
    LOG.info('Loading scaler from {}'.format(cfg.scaler_fn))
    scaler.load(cfg.scaler_fn)
else:
    scaler.calculate_scaler(ConcatDataset(list_dataset))
    LOG.info('Saving scaler to {}'.format(cfg.scaler_fn))
    scaler.save(cfg.scaler_fn)
LOG.debug(scaler.mean_)

transforms = get_transforms(cfg.max_frames, scaler, augment_type="noise")
transforms_valid = get_transforms(cfg.max_frames, scaler=scaler)
for i in range(len(list_dataset)):
    list_dataset[i].set_transform(transforms)
validation_data.set_transform(transforms_valid)
test_data.set_transform(transforms_valid)

concat_dataset = ConcatDataset(list_dataset)
sampler = MultiStreamBatchSampler(concat_dataset, batch_sizes=batch_sizes)
training_data = DataLoader(concat_dataset, batch_sampler=sampler)

# ##############
# Model
# ##############
crnn_kwargs = cfg.crnn_kwargs
crnn = CRNN(**crnn_kwargs)
crnn_ema = CRNN(**crnn_kwargs)

if path.exists(cfg.load_weights_fn):
    model_cfg = torch.load(cfg.load_weights_fn)
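# `crnn_ema` mirrors `crnn`, which suggests a mean-teacher setup where the
# teacher's weights track the student's by exponential moving average. A
# minimal sketch of such an update, assuming teacher and student share the
# same architecture (illustrative; not this repo's own update function):
def example_update_ema(student, teacher, alpha=0.999):
    import torch
    with torch.no_grad():
        for s_param, t_param in zip(student.parameters(), teacher.parameters()):
            # Move each teacher weight a small step towards the student weight.
            t_param.mul_(alpha).add_(s_param, alpha=1 - alpha)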
train_set = DataLoadDf(train_weak_df, many_hot_encoder.encode_weak,
                       Compose(list_trans_fr), return_indexes=False)
LOG.debug("len train : {}".format(len(train_set)))
# train_load = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True,
#                         drop_last=True, collate_fn=default_collate)

# scaler = Scaler()
scaler = ScalerSum()
scaler.calculate_scaler(train_set)
LOG.debug(scaler.mean_)
list_trans_fr.append(Normalize(scaler))
train_set.set_transform(Compose(list_trans_fr))

# Validation data
valid_weak_df = dfs["valid"]
if valid_weak_df is not None:
    valid_set = DataLoadDf(valid_weak_df, many_hot_encoder.encode_weak,
                           Compose(list_trans_fr), return_indexes=False)

list_trans_val = deepcopy(list_trans_fr)
if not args.segment:
    list_trans_val.append(Unsqueeze(0))
train_dl_emb = DataLoadDf(train_weak_df, many_hot_encoder.encode_weak,
                          Compose(list_trans_val))
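# `Normalize(scaler)` above standardises the features with the statistics the
# scaler accumulated over the training set. A minimal sketch of such a
# transform, assuming the scaler exposes `mean_` (logged above) and a matching
# `std_` attribute (the latter is an assumption, not confirmed by this file):
class ExampleNormalize:
    def __init__(self, scaler, eps=1e-8):
        self.scaler = scaler
        self.eps = eps  # avoid division by zero for silent bins

    def __call__(self, features):
        return (features - self.scaler.mean_) / (self.scaler.std_ + self.eps)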
def datasets_classif(model, train_weak_embed, valid_weak_dl_fr, test_dl_fr, args,
                     many_hot_encoder, classes, save_name="", eval_dl=None):
    """Compute embeddings of each split with the frozen model and return one
    DataLoader of embeddings per split."""
    encode_function_label = many_hot_encoder.encode_weak
    num_workers = cfg.num_workers
    model.eval()

    embed_dir = "stored_data/embeddings"
    embed_dir = os.path.join(embed_dir, save_name)
    create_folder(embed_dir)
    fig_dir = os.path.join(embed_dir, "figures")
    create_folder(fig_dir)

    if args.agg_time is not None:
        trans_embedding = [ToTensor(), View(-1)]
    else:
        trans_embedding = [ToTensor()]

    model = to_cuda_if_available(model)

    embed_set = "final"
    train_embed_dir = os.path.join(embed_dir, embed_set)
    df_weak, embed_weak = calculate_embedding(train_weak_embed, model,
                                              savedir=train_embed_dir,
                                              concatenate="append")
    weak_embed = DataLoadDf(df_weak, encode_function_label,
                            transform=Compose(trans_embedding))
    LOG.info(f"len weak embed: {len(weak_embed)}")
    weak_embed.set_transform(Compose(trans_embedding))

    batch_size_classif = cfg.batch_size_classif
    df_valid, embed_valid = calculate_embedding(valid_weak_dl_fr, model,
                                                savedir=train_embed_dir,
                                                concatenate="append")
    valid_embed = DataLoadDf(df_valid, encode_function_label,
                             transform=Compose(trans_embedding))

    embed_set = "final_test"
    test_embed_dir = os.path.join(embed_dir, embed_set)
    df_test_embed, emb_test = calculate_embedding(test_dl_fr, model,
                                                  savedir=test_embed_dir,
                                                  concatenate="append")
    test_embed = DataLoadDf(df_test_embed, encode_function_label,
                            transform=Compose(trans_embedding))

    if args.balance:
        # Balanced batches: sample the same number of examples per class.
        n_per_class = max(round(batch_size_classif / len(classes)), 1)
        weak_sampler = CategoriesSampler(weak_embed.df.event_labels, classes, n_per_class)
        weak_embed_loader = DataLoader(weak_embed, batch_sampler=weak_sampler,
                                       num_workers=num_workers)
        valid_sampler = CategoriesSampler(valid_embed.df.event_labels, classes, n_per_class)
        valid_embed_loader = DataLoader(valid_embed, batch_sampler=valid_sampler,
                                        num_workers=num_workers)
        test_sampler = CategoriesSampler(test_embed.df.event_labels, classes, n_per_class)
        test_embed_loader = DataLoader(test_embed, batch_sampler=test_sampler,
                                       num_workers=num_workers)
    else:
        weak_embed_loader = DataLoader(weak_embed, batch_size=batch_size_classif,
                                       num_workers=num_workers, shuffle=True, drop_last=True)
        valid_embed_loader = DataLoader(valid_embed, batch_size=batch_size_classif,
                                        shuffle=False, num_workers=num_workers, drop_last=False)
        test_embed_loader = DataLoader(test_embed, batch_size=batch_size_classif,
                                       shuffle=False, num_workers=num_workers, drop_last=False)

    if eval_dl is not None:
        model = to_cuda_if_available(model)
        embed_set = "final_eval"
        eval_embed_dir = os.path.join(embed_dir, embed_set)
        df_eval_embed, embed_eval = calculate_embedding(eval_dl, model,
                                                        savedir=eval_embed_dir,
                                                        concatenate="append")
        eval_embed = DataLoadDf(df_eval_embed, encode_function_label,
                                transform=Compose(trans_embedding))
        if args.balance:
            eval_sampler = CategoriesSampler(eval_embed.df.event_labels, classes, n_per_class)
            eval_embed_loader = DataLoader(eval_embed, batch_sampler=eval_sampler,
                                           num_workers=num_workers)
        else:
            eval_embed_loader = DataLoader(eval_embed, batch_size=batch_size_classif,
                                           shuffle=False, num_workers=num_workers, drop_last=False)
    else:
        eval_embed_loader = None

    model = to_cpu(model)
    return {
        "train": weak_embed_loader,
        "valid": valid_embed_loader,
        "test": test_embed_loader,
        "eval": eval_embed_loader,
    }
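# Usage sketch: the returned dict exposes one embedding DataLoader per split,
# so a shallow classifier can be trained on the frozen embeddings, e.g.
# (hypothetical caller; `classifier`, `criterion`, and `optimizer` are
# assumed, not defined in this file):
#
#   loaders = datasets_classif(model, train_weak_embed, valid_weak_dl_fr,
#                              test_dl_fr, args, many_hot_encoder, classes)
#   for batch_x, batch_y in loaders["train"]:
#       optimizer.zero_grad()
#       loss = criterion(classifier(batch_x), batch_y)
#       loss.backward()
#       optimizer.step()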