def get_model_config(self, config):
    # Select the audio encoder; image and attention models are shared.
    if config.model_type == 'davenet':
        self.audio_model = Davenet(input_dim=self.input_size,
                                   embedding_dim=1024)
    elif config.model_type == 'blstm':
        self.audio_model = BLSTM(512,
                                 input_size=self.input_size,
                                 n_layers=config.num_layers)
    else:
        raise ValueError(f'Invalid model type {config.model_type}')
    self.image_model = nn.Linear(2048, 1024)
    self.attention_model = DotProductClassAttender(
        input_dim=1024,
        hidden_dim=1024,
        n_class=self.n_visual_class)
    if config.mode in ['test', 'align']:
        self.load_checkpoint()
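# --- Usage sketch (illustrative only, not from the original repo) ---
# get_model_config expects a config exposing model_type, num_layers and mode;
# a plain SimpleNamespace is enough to drive it. `solver` below stands in for
# a hypothetical instance of the class this method belongs to.
#
#   from types import SimpleNamespace
#   config = SimpleNamespace(model_type='blstm', num_layers=2, mode='train')
#   solver.get_model_config(config)  # builds audio, image, attention models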
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    # Load the GloVe-indexed dataset and the pretrained embedding matrix.
    dataset = pickle2dict(input_dir + "features_glove.pkl")
    embeddings = pickle2dict(input_dir + "embeddings_glove.pkl")
    dataset["embeddings"] = embeddings

    emb_np = np.asarray(embeddings, dtype=np.float32)
    emb = torch.from_numpy(emb_np).to(device)

    blstm_model = BLSTM(embeddings=emb,
                        input_dim=embsize,
                        hidden_dim=hidden_size,
                        num_layers=n_layers,
                        output_dim=2,
                        max_len=max_len,
                        dropout=dropout)
    blstm_model = blstm_model.to(device)

    optimizer = optim.SGD(blstm_model.parameters(),
                          lr=l_rate,
                          weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()

    training_set = dataset["training"]
    training_set = YDataset(training_set["xIndexes"],
                            training_set["yLabels"],
                            to_pad=True,
                            max_len=max_len)

    best_acc_test, best_acc_valid = -np.inf, -np.inf
    training_accuracy, validation_accuracy = [], []  # track per-epoch accuracy
    batches_per_epoch = len(training_set) // batch_size
    for epoch in range(epochs):
        print("Epoch: {}".format(epoch))
        for n_batch in range(batches_per_epoch):
            training_batch = training_set.next_batch(batch_size)
            train(blstm_model, training_batch, optimizer, criterion)

        acc_val = test(blstm_model, dataset, data_part="validation")
        acc_train = test(blstm_model, dataset, data_part="training")
        training_accuracy.append(acc_train)
        validation_accuracy.append(acc_val)
        print("The training set prediction accuracy is {}".format(acc_train))
        print("The validation set prediction accuracy is {}".format(acc_val))
        print(" ")
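# `train` is called in the loop above but not defined in this snippet. A
# minimal sketch of what such a step typically looks like in PyTorch follows;
# the (inputs, labels) batch layout is an assumption, not taken from the
# original code, and the function name is hypothetical.
def train_step_sketch(model, batch, optimizer, criterion):
    model.train()
    inputs, labels = batch            # assumed (x, y) batch layout
    optimizer.zero_grad()             # clear gradients from the previous step
    logits = model(inputs)            # forward pass through the BLSTM
    loss = criterion(logits, labels)  # cross-entropy on the 2-way output
    loss.backward()                   # backpropagate
    optimizer.step()                  # SGD update (with weight decay)
    return loss.item()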
def get_model_config(self, config):
    if config.model_type == 'blstm':
        self.audio_net = cuda(BLSTM(self.K,
                                    n_layers=self.n_layers,
                                    n_class=self.n_visual_class,
                                    input_size=80,
                                    ds_ratio=1,
                                    bidirectional=True),
                              self.cuda)
    elif config.model_type == 'mlp':
        self.audio_net = cuda(MLP(self.K,
                                  n_layers=self.n_layers,
                                  n_class=self.n_visual_class,
                                  input_size=self.input_size,
                                  max_seq_len=self.max_segment_num),
                              self.cuda)
    else:
        raise ValueError(f'Invalid model type {config.model_type}')
def __init__(self, idim, hdim, K, n_layers, dropout, lamb):
    super(Model, self).__init__()
    self.net = BLSTM(idim, hdim, n_layers, dropout=dropout)
    # hdim * 2: the bidirectional LSTM concatenates both directions.
    self.linear = nn.Linear(hdim * 2, K)
    self.loss_fn = CTC_CRF_LOSS(lamb=lamb)
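# The constructor above only builds the modules; the forward pass is not
# shown in this snippet. A plausible sketch, assuming the BLSTM emits
# per-frame features of size hdim * 2 and that CTC-CRF training consumes
# per-frame log-probabilities over the K output tokens:
#
#   def forward(self, x, lens):
#       feats = self.net(x, lens)                 # (B, T, hdim * 2)
#       logits = self.linear(feats)               # (B, T, K) token scores
#       return torch.log_softmax(logits, dim=-1)  # log-probs for the loss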
def __init__(self, config):
    self.config = config
    self.cuda = torch.cuda.is_available()
    self.beta = 1.  # XXX
    self.epoch = config.epoch
    self.batch_size = config.batch_size
    self.lr = config.lr
    self.n_layers = config.get('num_layers', 3)
    self.eps = 1e-9

    if config.audio_feature == 'mfcc':
        self.audio_feature_net = None
        self.input_size = 80
        self.hop_len_ms = 10
    elif config.audio_feature == 'wav2vec2':
        self.audio_feature_net = cuda(
            fairseq.checkpoint_utils.load_model_ensemble_and_task(
                [config.wav2vec_path])[0][0],
            self.cuda)
        for p in self.audio_feature_net.parameters():
            p.requires_grad = False
        self.input_size = 512
        self.hop_len_ms = 20
    elif config.audio_feature == 'cpc':
        self.audio_feature_net = None
        self.input_size = 256
        self.hop_len_ms = 10
    else:
        raise ValueError(
            f"Feature type {config.audio_feature} not supported")

    self.K = config.K
    self.global_iter = 0
    self.global_epoch = 0
    self.audio_feature = config.audio_feature
    self.image_feature = config.image_feature
    self.debug = config.debug
    self.dataset = config.dataset
    self.max_normalize = config.get('max_normalize', False)
    self.loss_type = config.get('loss_type', 'macro_token_floss')
    self.beta_f_measure = config.get('beta_f_measure', 0.3)
    self.weight_word_loss = config.get('weight_word_loss', 1.0)
    self.weight_phone_loss = config.get('weight_phone_loss', 0.0)
    self.ckpt_dir = Path(config.ckpt_dir)
    if not self.ckpt_dir.exists():
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)

    if self.loss_type == 'macro_token_floss':
        self.criterion = MacroTokenFLoss(beta=self.beta_f_measure)
    elif self.loss_type == 'binary_cross_entropy':
        self.criterion = nn.BCELoss()
    else:
        raise ValueError(f'Invalid loss type {self.loss_type}')

    # Dataset
    self.data_loader = return_data(config)
    self.ignore_index = config.get('ignore_index', -100)
    self.n_visual_class = self.data_loader['train']\
        .dataset.preprocessor.num_visual_words
    self.n_phone_class = self.data_loader['train']\
        .dataset.preprocessor.num_tokens
    self.visual_words = self.data_loader['train']\
        .dataset.preprocessor.visual_words
    self.phone_set = self.data_loader['train'].dataset.preprocessor.tokens
    self.max_feat_len = self.data_loader['train'].dataset.max_feat_len
    self.max_word_len = self.data_loader['train'].dataset.max_word_len
    print(f'Number of visual label classes = {self.n_visual_class}')
    print(f'Number of phone classes = {self.n_phone_class}')
    print(f'Max normalized: {self.max_normalize}')

    self.audio_net = cuda(BLSTM(self.K,
                                n_layers=self.n_layers,
                                n_class=self.n_phone_class,
                                input_size=self.input_size,
                                ds_ratio=1,
                                bidirectional=True),
                          self.cuda)
    self.phone_net = cuda(HMMPronunciator(self.visual_words,
                                          self.phone_set,
                                          config=config,
                                          ignore_index=self.ignore_index),
                          self.cuda)
    self.phone_net.train_model()
    self.align_net = cuda(LinearPositionAligner(scale=0.), self.cuda)  # XXX

    trainables = [p for p in self.audio_net.parameters()]
    optim_type = config.get('optim', 'adam')
    if optim_type == 'sgd':
        self.optim = optim.SGD(trainables, lr=self.lr)
    else:
        self.optim = optim.Adam(trainables, lr=self.lr, betas=(0.5, 0.999))
    self.scheduler = lr_scheduler.ExponentialLR(self.optim, gamma=0.97)

    self.load_ckpt = config.load_ckpt
    if self.load_ckpt or config.mode in ['test', 'cluster']:
        self.load_checkpoint()

    # History
    self.history = dict()
    self.history['token_f1'] = 0.
    self.history['visual_token_f1'] = 0.
    self.history['loss'] = 0.
    self.history['epoch'] = 0
    self.history['iter'] = 0
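# MacroTokenFLoss is external to this snippet. A minimal sketch of a soft
# F-beta loss consistent with how it is constructed above (beta ~ 0.3,
# probabilities in, 1 - F_beta out, macro-averaged over classes) might look
# like this; the exact class in the original codebase may differ.
class SoftFBetaLossSketch(nn.Module):
    def __init__(self, beta=0.3, eps=1e-9):
        super().__init__()
        self.beta2 = beta ** 2
        self.eps = eps

    def forward(self, probs, targets):
        # probs, targets: (N, n_class) with values in [0, 1]
        tp = (probs * targets).sum(dim=0)        # soft true positives
        fp = (probs * (1 - targets)).sum(dim=0)  # soft false positives
        fn = ((1 - probs) * targets).sum(dim=0)  # soft false negatives
        # F_beta = (1 + b^2) * TP / ((1 + b^2) * TP + b^2 * FN + FP)
        f = ((1 + self.beta2) * tp + self.eps) / \
            ((1 + self.beta2) * tp + self.beta2 * fn + fp + self.eps)
        return 1. - f.mean()                     # macro-average over classes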
def __init__(self, idim, hdim, K, n_layers, dropout):
    super(Model, self).__init__()
    self.net = BLSTM(idim, hdim, n_layers, dropout)
    self.linear = nn.Linear(hdim * 2, K)
def __init__(self, config):
    self.config = config
    self.cuda = torch.cuda.is_available()
    self.epoch = config.epoch
    self.batch_size = config.batch_size
    self.beta = config.beta
    self.lr = config.lr
    self.n_layers = config.get('num_layers', 1)
    self.weight_phone_loss = config.get('weight_phone_loss', 1.)
    self.weight_word_loss = config.get('weight_word_loss', 1.)
    self.anneal_rate = config.get('anneal_rate', 3e-6)
    self.num_sample = config.get('num_sample', 1)
    self.eps = 1e-9
    self.max_grad_norm = config.get('max_grad_norm', None)

    if config.audio_feature == 'mfcc':
        self.audio_feature_net = None
        self.input_size = 80
        self.hop_len_ms = 10
    elif config.audio_feature == 'wav2vec2':
        self.audio_feature_net = cuda(
            fairseq.checkpoint_utils.load_model_ensemble_and_task(
                [config.wav2vec_path])[0][0],
            self.cuda)
        for p in self.audio_feature_net.parameters():
            p.requires_grad = False
        self.input_size = 512
        self.hop_len_ms = 20
    else:
        raise ValueError(
            f"Feature type {config.audio_feature} not supported")

    self.K = config.K
    self.global_iter = 0
    self.global_epoch = 0
    self.audio_feature = config.audio_feature
    self.image_feature = config.image_feature
    self.debug = config.debug
    self.dataset = config.dataset

    # Dataset
    self.data_loader = return_data(config)
    self.n_visual_class = self.data_loader['train']\
        .dataset.preprocessor.num_visual_words
    self.n_phone_class = self.data_loader['train']\
        .dataset.preprocessor.num_tokens
    self.visual_words = self.data_loader['train']\
        .dataset.preprocessor.visual_words
    print(f'Number of visual label classes = {self.n_visual_class}')
    print(f'Number of phone classes = {self.n_phone_class}')

    self.model_type = config.model_type
    if config.model_type == 'gumbel_blstm':
        self.audio_net = cuda(GumbelBLSTM(
            self.K,
            input_size=self.input_size,
            n_layers=self.n_layers,
            n_class=self.n_visual_class,
            n_gumbel_units=self.n_phone_class,
            ds_ratio=1,
            bidirectional=True), self.cuda)
        self.K = 2 * self.K
    elif config.model_type == 'blstm':
        self.audio_net = cuda(BLSTM(
            self.K,
            input_size=self.input_size,
            n_layers=self.n_layers,
            n_class=self.n_visual_class + self.n_phone_class,
            bidirectional=True), self.cuda)
        self.K = 2 * self.K
    elif config.model_type == 'mlp':
        self.audio_net = cuda(GumbelMLP(
            self.K,
            input_size=self.input_size,
            n_class=self.n_visual_class,
            n_gumbel_units=self.n_phone_class,
        ), self.cuda)
    elif config.model_type == 'tds':
        self.audio_net = cuda(GumbelTDS(
            input_size=self.input_size,
            n_class=self.n_visual_class,
            n_gumbel_units=self.n_phone_class,
        ), self.cuda)
    elif config.model_type == 'vq-mlp':
        self.audio_net = cuda(VQMLP(
            self.K,
            input_size=self.input_size,
            n_class=self.n_visual_class,
            n_embeddings=self.n_phone_class,
        ), self.cuda)
    else:
        raise ValueError(f'Invalid model type {config.model_type}')

    trainables = [p for p in self.audio_net.parameters()]
    optim_type = config.get('optim', 'adam')
    if optim_type == 'sgd':
        self.optim = optim.SGD(trainables, lr=self.lr)
    else:
        self.optim = optim.Adam(trainables, lr=self.lr, betas=(0.5, 0.999))
    self.scheduler = lr_scheduler.ExponentialLR(self.optim, gamma=0.97)

    self.ckpt_dir = Path(config.ckpt_dir)
    if not self.ckpt_dir.exists():
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)
    self.load_ckpt = config.load_ckpt
    if self.load_ckpt or config.mode in ['test', 'cluster']:
        self.load_checkpoint()

    # History
    self.history = dict()
    self.history['acc'] = 0.
    self.history['token_f1'] = 0.
    self.history['loss'] = 0.
    self.history['epoch'] = 0
    self.history['iter'] = 0
    self.history['temp'] = 1.
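# The history dict above tracks a Gumbel temperature ('temp') and the config
# carries an anneal_rate, but the annealing itself is not shown in this
# snippet. A common schedule (an assumption, not the original code) decays
# the temperature exponentially with the global step and feeds it to
# torch.nn.functional.gumbel_softmax:
#
#   import math
#   import torch.nn.functional as F
#
#   def anneal_temp(temp0, anneal_rate, step, temp_min=0.5):
#       return max(temp0 * math.exp(-anneal_rate * step), temp_min)
#
#   temp = anneal_temp(1., self.anneal_rate, self.global_iter)
#   phone_probs = F.gumbel_softmax(logits, tau=temp, hard=False, dim=-1)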