def get_model_config(self, config):
    if config.model_type == 'davenet':
        self.audio_model = Davenet(input_dim=self.input_size,
                                   embedding_dim=1024)
    elif config.model_type == 'blstm':
        self.audio_model = BLSTM(512,
                                 input_size=self.input_size,
                                 n_layers=config.num_layers)
    self.image_model = nn.Linear(2048, 1024)
    self.attention_model = DotProductClassAttender(
        input_dim=1024, hidden_dim=1024, n_class=self.n_visual_class)
    if config.mode in ['test', 'align']:
        self.load_checkpoint()
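
These examples come from different projects, so the BLSTM constructor signature varies from one snippet to the next. As a minimal sketch of the shared idea, a bidirectional wrapper around nn.LSTM could look like the following; all names and defaults here are assumptions, not any project's actual implementation.

import torch
import torch.nn as nn

class BLSTM(nn.Module):
    # Hypothetical minimal bidirectional LSTM encoder; the real BLSTM
    # classes used in these examples take different arguments.
    def __init__(self, hidden_dim, input_size, n_layers=1, dropout=0.0):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim,
                            num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        # (batch, time, input_size) -> (batch, time, 2 * hidden_dim)
        out, _ = self.lstm(x)
        return out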
Example #2
def main():
    # Assumed imports: numpy as np, torch, torch.nn as nn, torch.optim as optim;
    # pickle2dict, YDataset, BLSTM, train and test are project-local helpers,
    # and the hyperparameters (input_dir, embsize, hidden_size, n_layers,
    # max_len, dropout, l_rate, epochs, batch_size) are module-level globals.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    # Load the pre-extracted GloVe features and embedding matrix.
    dataset = pickle2dict(input_dir + "features_glove.pkl")
    embeddings = pickle2dict(input_dir + "embeddings_glove.pkl")
    dataset["embeddings"] = embeddings

    # Move the embedding matrix to the device as a float32 tensor.
    emb_np = np.asarray(embeddings, dtype=np.float32)
    emb = torch.from_numpy(emb_np).to(device)

    blstm_model = BLSTM(embeddings=emb,
                        input_dim=embsize,
                        hidden_dim=hidden_size,
                        num_layers=n_layers,
                        output_dim=2,
                        max_len=max_len,
                        dropout=dropout)

    blstm_model = blstm_model.to(device)

    optimizer = optim.SGD(blstm_model.parameters(),
                          lr=l_rate,
                          weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()

    training_set = dataset["training"]
    training_set = YDataset(training_set["xIndexes"],
                            training_set["yLabels"],
                            to_pad=True,
                            max_len=max_len)

    best_acc_test, best_acc_valid = -np.inf, -np.inf
    training_accuracy, validation_accuracy = [], []  # per-epoch accuracy history
    batches_per_epoch = len(training_set) // batch_size

    for epoch in range(epochs):
        print("Epoch:{}".format(epoch))
        for n_batch in range(batches_per_epoch):
            training_batch = training_set.next_batch(batch_size)
            train(blstm_model, training_batch, optimizer, criterion)
        acc_val = test(blstm_model, dataset, data_part="validation")
        acc_train = test(blstm_model, dataset, data_part="training")
        training_accuracy.append(acc_train)
        validation_accuracy.append(acc_val)
        print("The Training set prediction accuracy is {}".format(acc_train))
        print("The validation set prediction accuracy is {}".format(acc_val))
        print(" ")
Example #3
    def get_model_config(self, config):
        if config.model_type == 'blstm':
            self.audio_net = cuda(
                BLSTM(self.K,
                      n_layers=self.n_layers,
                      n_class=self.n_visual_class,
                      input_size=80,
                      ds_ratio=1,
                      bidirectional=True), self.cuda)

        elif config.model_type == 'mlp':
            self.audio_net = cuda(
                MLP(self.K,
                    n_layers=self.n_layers,
                    n_class=self.n_visual_class,
                    input_size=self.input_size,
                    max_seq_len=self.max_segment_num), self.cuda)
        else:
            raise ValueError(f'Invalid model type {config.model_type}')
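
Note that cuda here is not torch.cuda but a project helper applied to a module together with a boolean flag. A plausible one-line definition matching these call sites (an assumption):

def cuda(x, is_cuda):
    # Move a module or tensor to the GPU only when CUDA is available.
    return x.cuda() if is_cuda else x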
Example #4
def __init__(self, idim, hdim, K, n_layers, dropout, lamb):
    super(Model, self).__init__()
    self.net = BLSTM(idim, hdim, n_layers, dropout=dropout)
    self.linear = nn.Linear(hdim * 2, K)
    self.loss_fn = CTC_CRF_LOSS(lamb=lamb)
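
The forward pass of this model is not shown. A plausible sketch given the layers above, assuming the BLSTM returns batch-first features of size hdim * 2 per frame (both assumptions):

def forward(self, x):
    # Per-frame logits over K output units, to be fed to the CTC-CRF loss.
    hidden = self.net(x)           # (batch, time, hdim * 2)
    return self.linear(hidden)     # (batch, time, K)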
Example #5
    def __init__(self, config):
        self.config = config

        self.cuda = torch.cuda.is_available()
        self.beta = 1.  # XXX
        self.epoch = config.epoch
        self.batch_size = config.batch_size
        self.lr = config.lr
        self.n_layers = config.get('num_layers', 3)
        self.eps = 1e-9
        if config.audio_feature == 'mfcc':
            self.audio_feature_net = None
            self.input_size = 80
            self.hop_len_ms = 10
        elif config.audio_feature == 'wav2vec2':
            self.audio_feature_net = cuda(
                fairseq.checkpoint_utils.load_model_ensemble_and_task(
                    [config.wav2vec_path])[0][0], self.cuda)
            for p in self.audio_feature_net.parameters():
                p.requires_grad = False
            self.input_size = 512
            self.hop_len_ms = 20
        elif config.audio_feature == 'cpc':
            self.audio_feature_net = None
            self.input_size = 256
            self.hop_len_ms = 10
        else:
            raise ValueError(
                f"Feature type {config.audio_feature} not supported")

        self.K = config.K
        self.global_iter = 0
        self.global_epoch = 0
        self.audio_feature = config.audio_feature
        self.image_feature = config.image_feature
        self.debug = config.debug
        self.dataset = config.dataset
        self.max_normalize = config.get('max_normalize', False)
        self.loss_type = config.get('loss_type', 'macro_token_floss')
        self.beta_f_measure = config.get('beta_f_measure', 0.3)
        self.weight_word_loss = config.get('weight_word_loss', 1.0)
        self.weight_phone_loss = config.get('weight_phone_loss', 0.0)
        self.ckpt_dir = Path(config.ckpt_dir)
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)

        if self.loss_type == 'macro_token_floss':
            self.criterion = MacroTokenFLoss(beta=self.beta_f_measure)
        elif self.loss_type == 'binary_cross_entropy':
            self.criterion = nn.BCELoss()
        else:
            raise ValueError(f'Invalid loss type {self.loss_type}')

        # Dataset
        self.data_loader = return_data(config)
        self.ignore_index = config.get('ignore_index', -100)
        preprocessor = self.data_loader['train'].dataset.preprocessor
        self.n_visual_class = preprocessor.num_visual_words
        self.n_phone_class = preprocessor.num_tokens
        self.visual_words = preprocessor.visual_words
        self.phone_set = preprocessor.tokens
        self.max_feat_len = self.data_loader['train'].dataset.max_feat_len
        self.max_word_len = self.data_loader['train'].dataset.max_word_len
        print(f'Number of visual label classes = {self.n_visual_class}')
        print(f'Number of phone classes = {self.n_phone_class}')
        print(f'Max normalized: {self.max_normalize}')

        self.audio_net = cuda(
            BLSTM(self.K,
                  n_layers=self.n_layers,
                  n_class=self.n_phone_class,
                  input_size=self.input_size,
                  ds_ratio=1,
                  bidirectional=True), self.cuda)

        self.phone_net = cuda(
            HMMPronunciator(self.visual_words,
                            self.phone_set,
                            config=config,
                            ignore_index=self.ignore_index), self.cuda)
        self.phone_net.train_model()
        self.align_net = cuda(LinearPositionAligner(scale=0.),
                              self.cuda)  # XXX

        trainables = list(self.audio_net.parameters())
        optim_type = config.get('optim', 'adam')
        if optim_type == 'sgd':
            self.optim = optim.SGD(trainables, lr=self.lr)
        else:
            self.optim = optim.Adam(trainables, lr=self.lr, betas=(0.5, 0.999))
        self.scheduler = lr_scheduler.ExponentialLR(self.optim, gamma=0.97)
        self.load_ckpt = config.load_ckpt
        if self.load_ckpt or config.mode in ['test', 'cluster']:
            self.load_checkpoint()

        # History
        self.history = dict()
        self.history['token_f1'] = 0.
        self.history['visual_token_f1'] = 0.
        self.history['loss'] = 0.
        self.history['epoch'] = 0
        self.history['iter'] = 0
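
The config object in this example is read both through attributes (config.lr) and through config.get(...), which implies a dict-like object with attribute access. A minimal stand-in sufficient for these call sites (an assumption; the original project may use a dedicated config library):

class Config(dict):
    # Minimal mapping with attribute access, enough for the usage above.
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__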
Example #6
def __init__(self, idim, hdim, K, n_layers, dropout):
    super(Model, self).__init__()
    self.net = BLSTM(idim, hdim, n_layers, dropout)
    self.linear = nn.Linear(hdim * 2, K)
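
A shape-level usage sketch for this model, with hypothetical sizes and assuming a batch-first, bidirectional BLSTM:

model = Model(idim=80, hdim=256, K=42, n_layers=3, dropout=0.1)
x = torch.randn(8, 100, 80)     # (batch, time, idim)
feats = model.net(x)            # (batch, time, 2 * hdim)
logits = model.linear(feats)    # (batch, time, K)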
Example #7
  def __init__(self, config):
    self.config = config

    self.cuda = torch.cuda.is_available()
    self.epoch = config.epoch
    self.batch_size = config.batch_size
    self.beta = config.beta
    self.lr = config.lr
    self.n_layers = config.get('num_layers', 1)
    self.weight_phone_loss = config.get('weight_phone_loss', 1.)
    self.weight_word_loss = config.get('weight_word_loss', 1.)
    self.anneal_rate = config.get('anneal_rate', 3e-6)
    self.num_sample = config.get('num_sample', 1)
    self.eps = 1e-9
    self.max_grad_norm = config.get('max_grad_norm', None)
    if config.audio_feature == 'mfcc':
      self.audio_feature_net = None
      self.input_size = 80
      self.hop_len_ms = 10

    elif config.audio_feature == 'wav2vec2':
      self.audio_feature_net = cuda(
          fairseq.checkpoint_utils.load_model_ensemble_and_task(
              [config.wav2vec_path])[0][0], self.cuda)
      for p in self.audio_feature_net.parameters():
        p.requires_grad = False
      self.input_size = 512
      self.hop_len_ms = 20 
    else:
      raise ValueError(f"Feature type {config.audio_feature} not supported")
   
    self.K = config.K
    self.global_iter = 0
    self.global_epoch = 0
    self.audio_feature = config.audio_feature
    self.image_feature = config.image_feature
    self.debug = config.debug
    self.dataset = config.dataset

    # Dataset
    self.data_loader = return_data(config)
    preprocessor = self.data_loader['train'].dataset.preprocessor
    self.n_visual_class = preprocessor.num_visual_words
    self.n_phone_class = preprocessor.num_tokens
    self.visual_words = preprocessor.visual_words
    print(f'Number of visual label classes = {self.n_visual_class}')
    print(f'Number of phone classes = {self.n_phone_class}')
  
    self.model_type = config.model_type 
    if config.model_type == 'gumbel_blstm':
      self.audio_net = cuda(GumbelBLSTM(
                              self.K,
                              input_size=self.input_size,
                              n_layers=self.n_layers,
                              n_class=self.n_visual_class,
                              n_gumbel_units=self.n_phone_class,
                              ds_ratio=1,
                              bidirectional=True), self.cuda)
      self.K = 2 * self.K
    elif config.model_type == 'blstm':
      self.audio_net = cuda(BLSTM(
        self.K,
        input_size=self.input_size,
        n_layers=self.n_layers,
        n_class=self.n_visual_class+self.n_phone_class,
        bidirectional=True), self.cuda)
      self.K = 2 * self.K
    elif config.model_type == 'mlp':
      self.audio_net = cuda(GumbelMLP(
                                self.K,
                                input_size=self.input_size,
                                n_class=self.n_visual_class,
                                n_gumbel_units=self.n_phone_class,
                            ), self.cuda)
    elif config.model_type == 'tds':
      self.audio_net = cuda(GumbelTDS(
                              input_size=self.input_size,
                              n_class=self.n_visual_class,
                              n_gumbel_units=self.n_phone_class,
                            ), self.cuda)
    elif config.model_type == 'vq-mlp':
      self.audio_net = cuda(VQMLP(
                              self.K,
                              input_size=self.input_size,
                              n_class=self.n_visual_class,
                              n_embeddings=self.n_phone_class
                            ), self.cuda)
    else:
      raise ValueError(f'Invalid model type {config.model_type}')

    trainables = list(self.audio_net.parameters())
    optim_type = config.get('optim', 'adam')
    if optim_type == 'sgd':
      self.optim = optim.SGD(trainables, lr=self.lr)
    else:
      self.optim = optim.Adam(trainables,
                              lr=self.lr, betas=(0.5,0.999))
    self.scheduler = lr_scheduler.ExponentialLR(self.optim, gamma=0.97)
    self.ckpt_dir = Path(config.ckpt_dir)
    self.ckpt_dir.mkdir(parents=True, exist_ok=True)
    self.load_ckpt = config.load_ckpt
    if self.load_ckpt or config.mode in ['test', 'cluster']: 
      self.load_checkpoint()
    
    # History
    self.history = dict()
    self.history['acc'] = 0.
    self.history['token_f1'] = 0.
    self.history['loss'] = 0.
    self.history['epoch'] = 0
    self.history['iter'] = 0
    self.history['temp'] = 1.
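
The anneal_rate option and the history['temp'] entry suggest Gumbel-softmax temperature annealing during training. A common exponential schedule, shown here as an assumption since the update rule itself is not part of this snippet:

import numpy as np

def anneal_temperature(global_iter, anneal_rate=3e-6, temp_init=1.0, temp_min=0.5):
    # Exponentially decay the Gumbel-softmax temperature, clipped at a floor.
    return max(temp_init * np.exp(-anneal_rate * global_iter), temp_min)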