    def __init__(self,
                 config,
                 input_dim,
                 num_layers,
                 num_classes,
                 encoder_dim=None,
                 bert_pretrained=True,
                 bert_pretrained_model_name='bert-base-cased'):
        super().__init__()
        self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)
        self.bert_asr = get_bert(bert_pretrained, bert_pretrained_model_name)

        # project Lugosch encoder features (enc_dim = 128) up to BERT's hidden size (768)
        self.aux_embedding = nn.Linear(config.enc_dim,
                                       self.bert.config.hidden_size)
        self.lugosch_model = lugosch.models.PretrainedModel(config)

        pretrained_model_path = os.path.join(config.libri_folder,
                                             "libri_pretraining",
                                             "model_state.pth")

        self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
        self.config = config

        # freeze phoneme and word layers
        self.freeze_all_layers()
        self.unfreezing_index = 1
        self.maxpool = MaskedMaxPool()
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
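
freeze_all_layers() and unfreezing_index suggest a gradual-unfreezing scheme for the Lugosch encoder, but neither helper is shown on this page. A minimal sketch of what such methods might look like (unfreeze_one_layer and the word_layers attribute are assumptions, not code from this listing):

    def freeze_all_layers(self):
        # stop gradients for every parameter of the pretrained speech encoder
        for param in self.lugosch_model.parameters():
            param.requires_grad = False

    def unfreeze_one_layer(self):
        # hypothetical helper: re-enable gradients for the topmost still-frozen
        # word-level layer, then advance the index for the next call
        layers = list(self.lugosch_model.word_layers)  # assumed attribute name
        if self.unfreezing_index <= len(layers):
            for param in layers[-self.unfreezing_index].parameters():
                param.requires_grad = True
            self.unfreezing_index += 1
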
 def __init__(self, num_classes, pretrained=True):
     super().__init__()
     self.bert = get_bert(pretrained)
     self.classifier = nn.Sequential(
         nn.Dropout(0.3),
         nn.Linear(self.bert.config.hidden_size, num_classes)
     )
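
Every constructor on this page calls a get_bert helper that is not defined here. A minimal sketch, assuming it wraps Hugging Face's transformers library:

from transformers import BertConfig, BertModel

def get_bert(pretrained=True, model_name='bert-base-cased'):
    # return pretrained weights, or a randomly initialised model with the
    # same architecture when pretrained=False
    if pretrained:
        return BertModel.from_pretrained(model_name)
    return BertModel(BertConfig.from_pretrained(model_name))
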
Example #3
    def __init__(self,
                 config,
                 input_dim,
                 num_layers,
                 num_classes,
                 encoder_dim=None,
                 bert_pretrained=True,
                 bert_pretrained_model_name='bert-base-cased'):
        super().__init__()
        self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)

        # model 3 is BERT trained on ground-truth (GT) and ASR transcripts
        if config.bert_dir:
            print(
                f"loading model3 (bert pretrained on GT and ASR) from {config.bert_dir}"
            )
            chkpt_path = os.path.join(config.bert_dir, 'best_ckpt.pth')
            model_dict = self.bert.state_dict()
            pretrained_dict = torch.load(chkpt_path)
            pretrained_dict = {
                k.split(".", 1)[1]: v
                for k, v in pretrained_dict.items()
                if k.split(".", 1)[1] in model_dict
            }
            self.bert.load_state_dict(pretrained_dict)

        ### Alexa's speech encoder (SubsampledBiLSTMEncoder) is commented out; Lugosch's pretrained encoder is used instead

        # project Lugosch encoder features (enc_dim = 128) up to BERT's hidden size (768)
        self.aux_embedding = nn.Linear(config.enc_dim,
                                       self.bert.config.hidden_size)
        self.lugosch_model = lugosch.models.PretrainedModel(config)

        pretrained_model_path = os.path.join(config.libri_folder,
                                             "libri_pretraining",
                                             "model_state.pth")

        self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
        self.config = config

        # freeze phoneme and word layers
        self.freeze_all_layers()
        self.unfreezing_index = 1
        self.maxpool = MaskedMaxPool()
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
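
The dictionary comprehension above remaps checkpoint keys by dropping their first dotted component, so weights saved from a parent module that held BERT as an attribute (e.g. self.bert) line up with the bare BertModel state dict; keys whose stripped name is not in model_dict are silently filtered out. For example:

key = "bert.encoder.layer.0.attention.self.query.weight"
print(key.split(".", 1)[1])
# -> encoder.layer.0.attention.self.query.weight
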
Example #4
 def __init__(self,
              input_dim,
              num_layers,
              num_classes,
              encoder_dim=None,
              bert_pretrained=True,
              bert_pretrained_model_name='bert-base-cased'):
     super().__init__()
     self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)
     self.encoder_dim = encoder_dim
     if encoder_dim is None:
         self.speech_encoder = SubsampledBiLSTMEncoder(
             input_dim=input_dim,
             encoder_dim=self.bert.config.hidden_size // 2,
             num_layers=num_layers)
     else:
         self.speech_encoder = SubsampledBiLSTMEncoder(
             input_dim=input_dim,
             encoder_dim=encoder_dim,
             num_layers=num_layers)
         self.aux_embedding = nn.Linear(2 * encoder_dim,
                                        self.bert.config.hidden_size)
     self.maxpool = MaskedMaxPool()
     self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
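
MaskedMaxPool() appears in most of these constructors, but the class itself is not shown. A minimal sketch, assuming it max-pools encoder outputs over the time axis while ignoring padded positions (the (features, mask) signature is an assumption):

import torch
import torch.nn as nn

class MaskedMaxPool(nn.Module):
    # max over the time dimension, ignoring positions where mask == 0
    def forward(self, features, mask):
        # features: (batch, time, dim); mask: (batch, time), 1 for real frames
        neg_inf = torch.finfo(features.dtype).min
        features = features.masked_fill(mask.unsqueeze(-1) == 0, neg_inf)
        return features.max(dim=1).values
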
Example #5
    def __init__(self,
                 config,
                 input_dim,
                 num_layers,
                 num_classes,
                 encoder_dim=None,
                 bert_pretrained=True,
                 bert_pretrained_model_name='bert-base-cased'):
        super().__init__()
        self.bert = get_bert(bert_pretrained, bert_pretrained_model_name)

        ### Comment out Alexa's encoder
        #         self.encoder_dim = encoder_dim
        #         if encoder_dim is None:
        #             self.speech_encoder = SubsampledBiLSTMEncoder(input_dim=input_dim, encoder_dim=self.bert.config.hidden_size//2, num_layers=num_layers)
        #         else:
        #             self.speech_encoder = SubsampledBiLSTMEncoder(input_dim=input_dim, encoder_dim=encoder_dim, num_layers=num_layers)

        # project Lugosch encoder features (enc_dim = 128) up to BERT's hidden size (768)
        self.aux_embedding = nn.Linear(config.enc_dim,
                                       self.bert.config.hidden_size)
        # and back down: match BERT's hidden size to Lugosch's enc_dim
        self.aux_reverse = nn.Linear(self.bert.config.hidden_size,
                                     config.enc_dim)
        self.lugosch_model = lugosch.models.PretrainedModel(config)

        pretrained_model_path = os.path.join(config.libri_folder,
                                             "libri_pretraining",
                                             "model_state.pth")

        self.lugosch_model.load_state_dict(torch.load(pretrained_model_path))
        self.config = config

        # freeze phoneme and word layers
        self.freeze_all_layers()
        self.unfreezing_index = 1
        self.maxpool = MaskedMaxPool()
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

        # Based on Lugosch's Intent Module (class Model in models.py)
        self.intent_layers = []
        self.num_values_total = num_classes  # by default, the fluentai (Fluent Speech Commands) intent count

        self.num_rnn_layers = len(config.intent_rnn_num_hidden)
        self.out_dim = config.word_rnn_num_hidden[-1]
        if config.word_rnn_bidirectional:
            self.out_dim *= 2
        for idx in range(self.num_rnn_layers):
            # recurrent
            print("config.intent_rnn_bidirectional :",
                  config.intent_rnn_bidirectional)

            layer = torch.nn.GRU(input_size=self.out_dim,
                                 hidden_size=config.intent_rnn_num_hidden[idx],
                                 batch_first=True,
                                 bidirectional=config.intent_rnn_bidirectional)
            layer.name = "intent_rnn%d" % idx
            self.intent_layers.append(layer)

            self.out_dim = config.intent_rnn_num_hidden[idx]
            if config.intent_rnn_bidirectional:
                self.out_dim *= 2

            # grab hidden states of RNN for each timestep
            layer = RNNSelect()
            layer.name = "intent_rnn_select%d" % idx
            self.intent_layers.append(layer)

            # dropout
            layer = torch.nn.Dropout(p=config.intent_rnn_drop[idx])
            layer.name = "intent_dropout%d" % idx
            self.intent_layers.append(layer)

            # downsample
            layer = Downsample(method=config.intent_downsample_type[idx],
                               factor=config.intent_downsample_len[idx],
                               axis=1)
            layer.name = "intent_downsample%d" % idx
            self.intent_layers.append(layer)

        layer = torch.nn.Linear(self.out_dim, self.num_values_total)
        layer.name = "final_classifier"
        self.intent_layers.append(layer)

        layer = FinalPool()  # max-pool over time: 3D (batch, time, classes) -> 2D (batch, classes)
        layer.name = "final_pool"
        self.intent_layers.append(layer)

        self.lugosch_classifier = torch.nn.ModuleList(self.intent_layers)
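
The matching forward pass is not part of this listing. Because the intent layers were appended in order, a plausible sketch (the feature shape and method body are assumptions in the style of Lugosch's models) simply applies the ModuleList sequentially:

    def forward(self, features):
        # features: (batch, time, out_dim) word-level encoder outputs
        out = features
        for layer in self.lugosch_classifier:
            # each nn.GRU returns (output, h_n); the RNNSelect layer that
            # follows it picks the output sequence back out of that tuple
            out = layer(out)
        return out  # (batch, num_values_total) intent logits after FinalPool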