Example no. 1
0
    def __init__(self,
                 enc_inp_size,
                 d_latent,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu',
                 d_map_latent=8):
        """Transformer encoder that fuses input embeddings with map features.

        Args:
            enc_inp_size: dimensionality of each raw input step.
            d_latent: size of the latent vector produced by the final
                linear projection.
            N: number of stacked encoder layers.
            d_model: transformer width; inputs are embedded into
                d_model - d_map_latent dims so that concatenating a
                d_map_latent-wide map feature restores the full width.
            d_ff: hidden size of the position-wise feed-forward sublayers.
            h: number of attention heads.
            dropout: dropout probability used throughout.
            device: device handed to the map-encoder loader.
            d_map_latent: width reserved for the map feature latent.
        """
        super(EncoderY, self).__init__()
        self.d_model = d_model
        # Embedding is deliberately narrower than d_model; the remaining
        # d_map_latent channels are filled with map features at forward time.
        self.embed_fn = nn.Sequential(
            LinearEmbedding(enc_inp_size, d_model - d_map_latent),
            PositionalEncoding(d_model - d_map_latent, dropout))
        self.encoder = Encoder(
            EncoderLayer(d_model, MultiHeadAttention(h, d_model),
                         PointerwiseFeedforward(d_model, d_ff, dropout),
                         dropout), N)
        # Final projection from encoder width to the latent space.
        self.fc = nn.Linear(d_model, d_latent)

        # Only the freshly built submodules are (re-)initialised; the map
        # encoder below is loaded by a helper and left untouched —
        # presumably pre-trained (TODO confirm).
        self.init_weights(self.encoder.parameters())
        self.init_weights(self.fc.parameters())

        self.map_encoder = load_map_encoder(device)
    def create_model(cls, args):
        """Factory: assemble the CIF-style speech model from parsed ``args``.

        Builds the convolutional subsampling front-end, the self-attention
        encoder, the attention assigner and the CIF decoder, then constructs
        an instance of ``cls`` from them.
        """
        from transformer.attentionAssigner import Attention_Assigner
        from transformer.conv_encoder import Conv2dSubsample
        from transformer.decoder import Decoder_CIF as Decoder
        from transformer.encoder import Encoder

        # Convolutional front-end over (possibly LFR-stacked) input features.
        front_end = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                                    d_model=args.d_model,
                                    n_layers=args.n_conv_layers)

        acoustic_encoder = Encoder(d_input=args.d_model,
                                   n_layers=args.n_layers_enc,
                                   n_head=args.n_head,
                                   d_model=args.d_model,
                                   d_inner=args.d_inner,
                                   dropout=args.dropout)

        # Attention assigner used by the CIF decoding scheme.
        weight_assigner = Attention_Assigner(d_input=args.d_model,
                                             d_hidden=args.d_assigner_hidden,
                                             w_context=args.w_context,
                                             n_layers=args.n_assigner_layers)

        token_decoder = Decoder(sos_id=args.sos_id,
                                n_tgt_vocab=args.vocab_size,
                                n_layers=args.n_layers_dec,
                                n_head=args.n_head,
                                d_model=args.d_model,
                                d_inner=args.d_inner,
                                dropout=args.dropout)

        return cls(front_end, acoustic_encoder, weight_assigner,
                   token_decoder, args.spec_aug_cfg)
Example no. 3
0
    def __init__(self,
                 src_len,
                 tgt_len,
                 enc_inp_size,
                 dec_inp_size,
                 dec_out_size,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu'):
        """Transformer encoder-decoder used as the generator network.

        Args:
            src_len: source sequence length (stored for callers).
            tgt_len: target sequence length (stored for callers).
            enc_inp_size: encoder input feature size.
            dec_inp_size: decoder input feature size.
            dec_out_size: output feature size produced by the head.
            N: number of encoder/decoder layers.
            d_model: model width.
            d_ff: feed-forward hidden size.
            h: number of attention heads.
            dropout: dropout probability.
            device: stored so callers can place tensors appropriately.
        """
        super(Generator, self).__init__()
        self.device = device
        self.src_len = src_len
        self.tgt_len = tgt_len
        self.dec_inp_size = dec_inp_size

        # Deep-copy each prototype sublayer so layers share no weights.
        c = copy.deepcopy
        attn = MultiHeadAttention(h, d_model)
        ff = PointerwiseFeedforward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        self.generator = EncoderDecoder(
            Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
            Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout),
                    N),
            nn.Sequential(LinearEmbedding(enc_inp_size, d_model), c(position)),
            nn.Sequential(LinearEmbedding(dec_inp_size, d_model), c(position)),
            TFHeadGenerator(d_model, dec_out_size))

        # This was important from their code.
        # Initialize parameters with Glorot / fan_avg.
        for p in self.generator.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
Example no. 4
0
def make_model(
    src_vocab: int,
    tgt_vocab: int,
    n: int = 6,
    d_model: int = 512,
    d_ff: int = 2048,
    h: int = 8,
    dropout: float = 0.1,
    device: torch.device = torch.device("cpu"),
) -> EncoderDecoder:
    """Assemble a full encoder-decoder transformer from hyperparameters."""
    clone = copy.deepcopy
    # Prototype sublayers; every stack position gets its own deep copy so
    # no parameters are shared between layers.
    self_attn = MultiHeadedAttention(h, d_model)
    feed_fwd = PositionwiseFeedForward(d_model, d_ff, dropout)
    pos_enc = PositionalEncoding(d_model, dropout)

    encoder = Encoder(
        EncoderLayer(d_model, clone(self_attn), clone(feed_fwd), dropout), n)
    decoder = Decoder(
        DecoderLayer(d_model, clone(self_attn), clone(self_attn),
                     clone(feed_fwd), dropout), n)
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), clone(pos_enc))
    tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), clone(pos_enc))

    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed,
                           Generator(d_model, tgt_vocab)).to(device)

    # Glorot / fan_avg initialisation for every matrix-shaped parameter
    # (biases and other 1-D parameters are left as constructed).
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model
Example no. 5
0
    def __init__(self,
                 enc_inp_size,
                 dec_inp_size,
                 dec_out_size,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 heads=8,
                 dropout=0.1,
                 mean=(0, 0),
                 std=(0, 0)):
        """Encoder-decoder transformer for individual trajectories.

        Args:
            enc_inp_size: encoder input feature size.
            dec_inp_size: decoder input feature size.
            dec_out_size: output feature size produced by the generator head.
            N: number of encoder/decoder layers.
            d_model: model width.
            d_ff: feed-forward hidden size.
            heads: number of attention heads.
            dropout: dropout probability.
            mean: normalisation mean, stored as a numpy array.
            std: normalisation std, stored as a numpy array.

        Note:
            ``mean``/``std`` defaults are immutable tuples; the original
            used mutable list defaults (a classic Python pitfall).
            ``np.array`` accepts either, so callers are unaffected.
        """
        super(IndividualTF, self).__init__()
        # Deep-copy each prototype sublayer so layers share no weights.
        c = copy.deepcopy
        attn = MultiHeadAttention(heads, d_model)
        ff = PointerwiseFeedforward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        self.mean = np.array(mean)
        self.std = np.array(std)

        self.model = EncoderDecoder(
            Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
            Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout),
                    N),
            nn.Sequential(LinearEmbedding(enc_inp_size, d_model), c(position)),
            nn.Sequential(LinearEmbedding(dec_inp_size, d_model), c(position)),
            Generator(d_model, dec_out_size))

        # This was important from their code.
        # Initialize parameters with Glorot / fan_avg.
        for p in self.model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    def create_model(cls, args):
        """Factory: build the conv front-end, encoder and decoder from
        ``args``, then construct an instance of ``cls`` from them."""
        from transformer.conv_encoder import Conv2dSubsample
        from transformer.decoder import Decoder
        from transformer.encoder import Encoder

        # Convolutional subsampling over (possibly LFR-stacked) features.
        front_end = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                                    d_model=args.d_model,
                                    n_layers=args.n_conv_layers)

        speech_encoder = Encoder(d_input=args.d_model,
                                 n_layers=args.n_layers_enc,
                                 n_head=args.n_head,
                                 d_model=args.d_model,
                                 d_inner=args.d_inner,
                                 dropout=args.dropout)

        text_decoder = Decoder(sos_id=args.sos_id,
                               eos_id=args.eos_id,
                               n_tgt_vocab=args.vocab_size,
                               n_layers=args.n_layers_dec,
                               n_head=args.n_head,
                               d_model=args.d_model,
                               d_inner=args.d_inner,
                               dropout=args.dropout)

        return cls(front_end,
                   speech_encoder,
                   text_decoder,
                   spec_aug_cfg=args.spec_aug_cfg)
Example no. 7
0
    def __init__(self,
                 disc_inp_size,
                 disc_seq_len,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu'):
        """Transformer-encoder critic producing one scalar per sequence.

        Args:
            disc_inp_size: per-step input feature size.
            disc_seq_len: fixed sequence length; the head flattens all
                d_model * disc_seq_len encoder features into a single score.
            N: number of encoder layers.
            d_model: model width.
            d_ff: feed-forward hidden size.
            h: number of attention heads.
            dropout: dropout probability.
            device: stored so callers can place tensors appropriately.
        """
        super(Critic, self).__init__()

        self.device = device

        # Deep-copy each prototype sublayer so layers share no weights.
        c = copy.deepcopy
        attn = MultiHeadAttention(h, d_model)
        ff = PointerwiseFeedforward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        # ModuleDict keeps the three stages addressable by name:
        # embed -> encode -> flatten+project to a single scalar.
        self.critic = nn.ModuleDict({
            'src_embed':
            nn.Sequential(LinearEmbedding(disc_inp_size, d_model),
                          c(position)),
            'encoder':
            Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
            'disc_head':
            nn.Sequential(nn.Flatten(), nn.Linear(d_model * disc_seq_len, 1)),
        })

        # Glorot initialisation for all matrix-shaped parameters.
        for p in self.critic.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
Example no. 8
0
 def __init__(self, vocabulary_size_in, vocabulary_size_out, constants, hyperparams):
     """Transformer with separate source/target embeddings, a bias-free
     output projection, and optional three-way weight tying.

     Args:
         vocabulary_size_in: source vocabulary size.
         vocabulary_size_out: target vocabulary size.
         constants: project constants forwarded to the embeddings.
         hyperparams: object exposing MAX_SEQ, D_MODEL, NB_LAYERS,
             NB_HEADS, NB_NEURONS, DROPOUT and SHARE_WEIGHTS.
     """
     super(Transformer, self).__init__()
     self.constants = constants
     self.max_seq = hyperparams.MAX_SEQ
     self.EmbeddingSrc = Embedding(vocabulary_size=vocabulary_size_in, d_model=hyperparams.D_MODEL, constants=constants)
     self.EmbeddingTgt = Embedding(vocabulary_size=vocabulary_size_out, d_model=hyperparams.D_MODEL, constants=constants)
     self.Encoder = Encoder(nb_layers=hyperparams.NB_LAYERS, nb_heads=hyperparams.NB_HEADS, d_model=hyperparams.D_MODEL, nb_neurons=hyperparams.NB_NEURONS, dropout=hyperparams.DROPOUT)
     self.Decoder = Decoder(nb_layers=hyperparams.NB_LAYERS, nb_heads=hyperparams.NB_HEADS, d_model=hyperparams.D_MODEL, nb_neurons=hyperparams.NB_NEURONS, dropout=hyperparams.DROPOUT)
     self.Linear = nn.Linear(hyperparams.D_MODEL, vocabulary_size_out, bias=False)
     if hyperparams.SHARE_WEIGHTS:
         # Three-way tying: both lookup tables point at the output
         # projection's weight. NOTE(review): tying the SOURCE embedding is
         # only shape-valid when vocabulary_size_in == vocabulary_size_out
         # -- confirm with the callers.
         self.EmbeddingSrc.lookup_table.weight = self.Linear.weight
         self.EmbeddingTgt.lookup_table.weight = self.Linear.weight
Example no. 9
0
    def test_forward(self):
        """Encoder.forward returns a NaN-free tensor with the input's shape."""
        # One encoder layer at the standard base-transformer sizes
        # (d_model=512, 8 heads of 64 dims, FFN width 2048).
        enc_layer = EncoderLayer(
            size=512,
            self_attention=MultiHeadAttention(n_head=8,
                                              d_model=512,
                                              d_k=64,
                                              d_v=64,
                                              dropout=0.1),
            feed_forward=PositionwiseFeedForward(d_model=512,
                                                 d_ff=2048,
                                                 dropout=0.1),
            dropout=0.1)

        encoder = Encoder(layer=enc_layer, n_layers=6)
        # Dummy batch: (batch=64, seq_len=10, d_model=512) of ones.
        x = torch.ones((64, 10, 512))

        out = encoder.forward(x, mask=None, verbose=False)

        self.assertIsInstance(out, torch.Tensor)
        # The encoder stack is shape-preserving.
        self.assertEqual(out.shape, x.shape)
        # check no nan values
        self.assertEqual(torch.isnan(out).sum(), 0)
Example no. 10
0
class EncoderX(nn.Module):
    """Transformer encoder that fuses trajectory embeddings with map
    features and returns per-step encodings plus a latent vector read off
    a prepended random query token."""

    def __init__(self,
                 enc_inp_size,
                 d_latent,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu',
                 d_map_latent=8):
        """Build the embedding, encoder stack, latent head and map encoder.

        Args:
            enc_inp_size: dimensionality of each raw input step.
            d_latent: size of the latent produced by the final projection.
            N: number of stacked encoder layers.
            d_model: transformer width; inputs are embedded into
                d_model - d_map_latent dims so concatenating the
                d_map_latent-wide map feature restores the full width.
            d_ff: feed-forward hidden size.
            h: number of attention heads.
            dropout: dropout probability.
            device: device handed to the map-encoder loader.
            d_map_latent: width reserved for the map feature latent.
        """
        super(EncoderX, self).__init__()

        self.d_model = d_model
        self.embed_fn = nn.Sequential(
            LinearEmbedding(enc_inp_size, d_model - d_map_latent),
            PositionalEncoding(d_model - d_map_latent, dropout))
        self.encoder = Encoder(
            EncoderLayer(d_model, MultiHeadAttention(h, d_model),
                         PointerwiseFeedforward(d_model, d_ff, dropout),
                         dropout), N)
        self.fc = nn.Linear(d_model, d_latent)

        # Only freshly built submodules are initialised; the map encoder is
        # loaded by a helper (presumably pre-trained) and left untouched.
        self.init_weights(self.encoder.parameters())
        self.init_weights(self.fc.parameters())

        self.map_encoder = load_map_encoder(device)

    def init_weights(self, params):
        """Xavier-uniform initialisation for matrix-shaped parameters;
        1-D parameters (biases) are left as constructed."""
        for p in params:
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, src_mask, map):
        """Encode `src` fused with map features.

        Returns:
            Tuple of (per-step encoder outputs without the query token,
            latent logit vector from the query-token position).
        """
        # Flatten the leading batch/sequence dims so the map encoder sees
        # one sample per row.
        map_feat = self.map_encoder(src.reshape(-1, src.shape[2]),
                                    map.reshape(-1, map.shape[2], map.shape[3],
                                                map.shape[4]),
                                    train=False)
        # NOTE(review): the hard-coded 8 looks like the number of observed
        # steps per sample (matches the `1+8` comment below) — confirm
        # against the caller.
        map_feat = map_feat.reshape((-1, 8, map_feat.shape[-1]))

        # Random query token prepended to the sequence; its encoder output
        # is projected to the latent logit. torch.rand replaces the
        # deprecated torch.autograd.Variable + numpy RNG, so the draw
        # respects torch seeding and is created directly on src's device.
        logit_token = torch.rand(src.shape[0], 1, self.d_model,
                                 device=src.device)
        src_emb = torch.cat((self.embed_fn(src), map_feat), dim=-1)
        logit_src_emb = torch.cat((logit_token, src_emb), dim=1)

        enc_out = self.encoder(logit_src_emb, src_mask)  # bs, 1+8, 512
        logit = self.fc(enc_out[:, 0])

        return enc_out[:, 1:], logit
Example no. 11
0
 def __init__(self,
              num_layers,
              d_model,
              num_heads,
              dff,
              input_vocab_size,
              target_vocab_size,
              pe_input,
              pe_target,
              rate=0.1):
     """TensorFlow/Keras transformer: encoder, decoder and a final dense
     projection onto the target vocabulary.

     Args:
         num_layers: layers in both the encoder and the decoder stacks.
         d_model: model width.
         num_heads: number of attention heads.
         dff: feed-forward hidden size.
         input_vocab_size: source vocabulary size.
         target_vocab_size: target vocabulary size.
         pe_input: maximum positional-encoding length for the encoder.
         pe_target: maximum positional-encoding length for the decoder.
         rate: dropout rate.
     """
     super(Transformer, self).__init__()
     self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                            input_vocab_size, pe_input, rate)
     self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                            target_vocab_size, pe_target, rate)
     # Projects decoder output to logits over the target vocabulary.
     self.final_layer = tf.keras.layers.Dense(target_vocab_size)
Example no. 12
0
    def __init__(self,
                 n_src_vocab,
                 n_trg_vocab,
                 src_pad_idx,
                 trg_pad_idx,
                 d_word_vec=256,
                 d_model=256,
                 d_inner=512,
                 n_layer=3,
                 n_head=8,
                 dropout=0.1,
                 n_position=200):
        """Sequence-to-sequence transformer.

        Builds an encoder over the source vocabulary, a decoder over the
        target vocabulary, and a bias-free projection from d_model onto the
        target vocabulary. All matrix-shaped parameters are
        Xavier-initialised.
        """
        super(Transformer, self).__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        # Hyperparameters shared by both halves of the model.
        shared = dict(d_word_vec=d_word_vec,
                      d_model=d_model,
                      d_inner=d_inner,
                      n_layer=n_layer,
                      n_head=n_head,
                      dropout=dropout,
                      n_position=n_position)

        self.encoder = Encoder(n_src_vocab=n_src_vocab,
                               pad_idx=src_pad_idx,
                               max_seq_len=32,
                               **shared)
        self.decoder = Decoder(n_trg_vocab=n_trg_vocab,
                               pad_idx=trg_pad_idx,
                               **shared)

        self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)

        # Glorot initialisation for every matrix-shaped parameter; 1-D
        # parameters (biases, norms) keep their constructed values.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
Example no. 13
0
    def __init__(self,
                 *args,
                 embedding_rank=None,
                 inner_rank=None,
                 ffward_rank=None,
                 **kwargs):
        """NMT transformer with optional low-rank factorisation.

        Keyword Args:
            embedding_rank: embedding factorisation rank; falls back to
                ``transformer_config.embedding_rank`` when None.
            inner_rank: inner factorisation rank; same fallback scheme.
            ffward_rank: feed-forward factorisation rank; same fallback.
        """
        # Run super constructor from NMTModel, but don't run NMTModel.__init__
        super(NMTModel, self).__init__()
        # Load the shared vocabulary. A context manager closes the file
        # promptly; the original `pickle.load(open(...))` leaked the handle.
        with open(paths.vocab, 'rb') as vocab_file:
            self.vocab = pickle.load(vocab_file)

        # Fall back to configured ranks when not supplied explicitly.
        if embedding_rank is None:
            embedding_rank = transformer_config.embedding_rank
        if inner_rank is None:
            inner_rank = transformer_config.inner_rank
        if ffward_rank is None:
            ffward_rank = transformer_config.ffward_rank
        # Debug output of the effective factorisation configuration.
        print(transformer_config.embedding_factorization,
              transformer_config.inner_factorization,
              transformer_config.ffward_factorization)
        print(embedding_rank, inner_rank, ffward_rank)
        self.encoder = Encoder(len(self.vocab.src), embedding_rank, inner_rank,
                               ffward_rank)
        self.decoder = Decoder(len(self.vocab.tgt), embedding_rank, inner_rank,
                               ffward_rank)

        self.gpu = False
        self.initialize()

        # Warmup learning-rate schedule (NoamOpt) wrapped around an Adam
        # optimizer created with lr=0 — the scheduler drives the rate.
        self.optimizer = NoamOpt(transformer_config.layer_dimension,
                                 train_config.lr,
                                 4000,
                                 Adam(
                                     self.parameters(),
                                     lr=0,
                                     betas=(0.9, 0.98),
                                     eps=1e-9,
                                 ),
                                 beginning_step=0)

        # Gradient-accumulation bookkeeping (at least one step per update).
        self.num_accumulations = 0
        self.accumulate = max(1, train_config.accumulate)
    def create_model(cls, args):
        """Factory: build encoder and decoder from ``args`` and construct
        an instance of ``cls`` from them."""
        from transformer.decoder import Decoder
        from transformer.encoder import Encoder

        encoder = Encoder(d_input=args.d_input * args.LFR_m,
                          n_layers=args.n_layers_enc,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)
        decoder = Decoder(sos_id=args.sos_id,
                          eos_id=args.eos_id,
                          n_tgt_vocab=args.vocab_size,
                          n_layers=args.n_layers_dec,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)

        # BUG FIX: the original called `cls.create_model(encoder, decoder)`,
        # re-entering this factory with the wrong arguments instead of
        # building the model. Instantiate the class directly, matching the
        # sibling factories in this project.
        model = cls(encoder, decoder)

        return model
    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 device,
                 d_model=512,
                 p_dropout=0.1):
        """Transformer with learned token embeddings, positional encoders,
        6-layer encoder/decoder stacks and a log-softmax output head.

        Args:
            src_vocab_size: source vocabulary size.
            tgt_vocab_size: target vocabulary size.
            device: device handed to the positional encoders.
            d_model: model width.
            p_dropout: dropout probability inside the positional encoders.
        """
        super(Transformer, self).__init__()
        self.d_model = d_model

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.positional_encoder1 = PositionalEncoder(device,
                                                     d_model=d_model,
                                                     p_dropout=p_dropout)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoder2 = PositionalEncoder(device,
                                                     d_model=d_model,
                                                     p_dropout=p_dropout)
        self.encoder = Encoder(6, d_model)
        self.decoder = Decoder(6, d_model)
        self.linear = nn.Linear(d_model, tgt_vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

        # Share weights: nn.Linear(d_model, V).weight has shape (V, d_model),
        # the same as nn.Embedding(V, d_model).weight, so the output
        # projection reuses the target embedding matrix.
        self.linear.weight = self.tgt_embedding.weight
Example no. 16
0
def train_net(args):
    """Train a Transformer: build or restore a checkpoint, run one
    training and one validation pass per epoch, and checkpoint on the
    best validation loss.

    NOTE(review): relies on module-level names (n_src_vocab, sos_id,
    eos_id, n_tgt_vocab, device, logger, pad_collate, train, valid,
    save_checkpoint) — confirm they are defined in the enclosing module.
    """
    # Fix RNG seeds for reproducibility.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # model
        encoder = Encoder(n_src_vocab,
                          args.n_layers_enc,
                          args.n_head,
                          args.d_k,
                          args.d_v,
                          args.d_model,
                          args.d_inner,
                          dropout=args.dropout,
                          pe_maxlen=args.pe_maxlen)
        decoder = Decoder(
            sos_id,
            eos_id,
            n_tgt_vocab,
            args.d_word_vec,
            args.n_layers_dec,
            args.n_head,
            args.d_k,
            args.d_v,
            args.d_model,
            args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # print(model)
        # model = nn.DataParallel(model)

        # optimizer: Adam wrapped in a scheduler that drives the rate
        # (see TransformerOptimizer, which exposes .lr and .step_num).
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))

    else:
        # Resume everything (including the model object itself) from disk.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders; pad_collate pads variable-length batches.
    train_dataset = AiChallenger2017Dataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)
    valid_dataset = AiChallenger2017Dataset('valid')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=False,
                                               num_workers=args.num_workers)

    # Epochs
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)

        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # One epoch's validation
        valid_loss = valid(valid_loader=valid_loader,
                           model=model,
                           logger=logger)
        writer.add_scalar('epoch/valid_loss', valid_loss, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_loss, is_best)
Example no. 17
0
def main():
    """Entry point for the speech-recognition baseline: parse CLI args,
    build the label maps, construct the Transformer + optimizer, bind to
    NSML, then run the training/evaluation loop.

    NOTE(review): depends on many module-level names (WORD_MAXLEN,
    SOUND_MAXLEN, DATASET_PATH, TRAIN_LABEL_CHAR_PATH, TRAIN_LABEL_POS_PATH,
    nsml, logger, bind_model, split_dataset, train, evaluate, ...) —
    confirm they exist in the enclosing module.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word',
        action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index',
                        action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteratiom')
    parser.add_argument('--premodel_session',
                        type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model',
                        type=int,
                        default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers',
                        type=int,
                        default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers',
                        type=int,
                        default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward',
                        type=int,
                        default=2048,
                        help='transformer_d_model')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier',
                        type=int,
                        default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch',
                        type=int,
                        default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()
    # Character-level label map is always loaded; word-level optionally
    # replaces it below.
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # Switch to the word(POS)-based label map.
        # (original comment was mojibake — presumably Korean; intent
        # inferred from the surrounding code.)
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)

    # NOTE(review): the encoder/decoder sizes below are hard-coded and
    # ignore the --d_model/--n_head/... CLI arguments parsed above.
    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(),
                         lr=0.0004,
                         betas=(0.9, 0.98),
                         eps=1e-09))

    ############/

    # NOTE(review): this re-initialises EVERY parameter uniformly,
    # overwriting whatever initialisation the modules performed.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)
    
    
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # Collect wav/label path pairs from the dataset manifest.
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    # Optionally warm-start from a previous NSML session/iteration.
    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        print("~~~~~~~~~~~~")

        # Evaluate only at epoch 10 and then every 10th epoch after 48.
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))

            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)

            if best_model:
                nsml.save('best')
                best_loss = eval_loss
Example no. 18
0
def train_net(args):
    """Train a Transformer translation model end to end.

    Builds the encoder/decoder model from scratch (or restores it from
    ``args.checkpoint``), then runs the epoch loop: train one epoch, log the
    loss to TensorBoard and append it to ``loss_epoch.txt``, and save a
    checkpoint. The full loss history is written to ``loss.txt`` at the end.

    :param args: parsed command-line arguments with model hyper-parameters,
        ``checkpoint``, ``epochs``, ``batch_size`` and ``num_workers``.
    """
    # Fix RNG seeds so repeated runs produce identical results.
    torch.manual_seed(7)
    np.random.seed(7)

    ckpt_path = args.checkpoint
    start_epoch = 0
    writer = SummaryWriter()

    if ckpt_path is None:
        # Fresh run: assemble encoder, decoder and the wrapped Adam optimizer.
        encoder = Encoder(Config.vocab_size, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        decoder = Decoder(Config.sos_id, Config.eos_id, Config.vocab_size,
                          args.d_word_vec, args.n_layers_dec, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout,
                          tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                          pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))
    else:
        # Resume: the checkpoint bundles epoch, model and optimizer state.
        state = torch.load(ckpt_path)
        start_epoch = state['epoch'] + 1
        model = state['model']
        optimizer = state['optimizer']

    # Move to GPU, if available.
    model = model.to(Config.device)

    # pad_collate pads every sequence in a batch to a common length.
    train_loader = torch.utils.data.DataLoader(TranslateDataset(),
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    loss_history = []
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training.
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)

        loss_str = str(train_loss)
        loss_history.append(loss_str)

        # Append this epoch's loss immediately so progress survives a crash.
        with open('loss_epoch.txt', 'a+') as f:
            f.write(loss_str + '\n')

        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # Save checkpoint after every epoch.
        save_checkpoint(epoch, model, optimizer, train_loss)

    # Dump the complete per-epoch loss history in one file.
    with open('loss.txt', 'w') as f:
        f.write('\n'.join(loss_history))
Esempio n. 19
0
from transformer.optimizer import TransformerOptimizer
from transformer.transformer import Transformer
from utils import parse_args

# NOTE(review): this entry-point script is truncated in this view — the
# Decoder(...) call at the bottom is cut off mid-argument-list and the rest
# of the script is not visible here.
if __name__ == '__main__':
    # Learning-rate schedule constants — presumably consumed by an optimizer
    # below this view; init_lr = 512**-0.5 matches a d_model of 512.
    k = 0.2
    warmup_steps = 4000
    init_lr = 512**(-0.5)

    args = parse_args()

    # Encoder input size: per-frame feature dimension times LFR_m
    # (presumably a low-frame-rate stacking factor — TODO confirm).
    encoder = Encoder(args.d_input * args.LFR_m,
                      args.n_layers_enc,
                      args.n_head,
                      args.d_k,
                      args.d_v,
                      args.d_model,
                      args.d_inner,
                      dropout=args.dropout,
                      pe_maxlen=args.pe_maxlen)
    # Decoder over the target vocabulary; sos_id / eos_id / vocab_size are
    # defined outside this view.
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
Esempio n. 20
0
    # NOTE(review): fragment starts mid-block — this call sits inside a
    # conditional whose header is above this view.
    os.makedirs(LOG_PATH)
# Output file paths under LOG_PATH ('/' join — presumably a pathlib.Path);
# f_name / r_name are defined above this view.
eer_path = LOG_PATH / f_name
res_path = LOG_PATH / r_name
"""
train
"""
# flat_shape = hp["numcep"] * hp["nb_time"]
# Input dimension = number of cepstral coefficients; one label per speaker.
d_input = hp["numcep"]
label_shape = len(train_speaker_list)

# model
# Transformer encoder -> self-attention pooling -> classifier over speakers.
d_m = hp["d_m"]
encoder = Encoder(d_input=d_input,
                  n_layers=2,
                  d_k=d_m,
                  d_v=d_m,
                  d_m=d_m,
                  d_ff=hp["d_ff"],
                  dropout=0.1).to(device)
pooling = SelfAttentionPooling(d_m, dropout=0.1).to(device)
model = Transformer(encoder, pooling, d_m, label_shape, dropout=0.2).to(device)

opt = torch.optim.Adam(model.parameters(),
                       lr=hp["lr"],
                       weight_decay=hp["weight_decay"])

loss_func = torch.nn.CrossEntropyLoss()

# Best (lowest) metric seen so far — presumably equal error rate, given
# eer_path above; initialized to a sentinel high value.
best_eer = 99.
# NOTE(review): truncated — the body of the `with` block is below this view.
if hp["comet"]:
    with experiment.train():
Esempio n. 21
0
    def __init__(self, params: dict):
        """
        Instantiate the ``Transformer`` class from a nested parameter dict.

        :param params: Dict with the settings for every sub-module, e.g.:

            params = {
                'd_model': 512,
                'src_vocab_size': 27000,
                'tgt_vocab_size': 27000,

                'N': 6,
                'dropout': 0.1,

                'attention': {'n_head': 8,
                              'd_k': 64,
                              'd_v': 64,
                              'dropout': 0.1},

                'feed-forward': {'d_ff': 2048,
                                 'dropout': 0.1},
            }

        """
        # call base constructor
        super(Transformer, self).__init__()

        # Keep the raw parameter dict so checkpoints can rebuild the model.
        self._params = params

        d_model = params['d_model']
        dropout = params['dropout']
        n_layers = params['N']
        attn_cfg = params['attention']
        ff_cfg = params['feed-forward']

        def make_attention():
            # Fresh multi-head attention instance per call site (the three
            # attention blocks must not share weights).
            return MultiHeadAttention(n_head=attn_cfg['n_head'],
                                      d_model=d_model,
                                      d_k=attn_cfg['d_k'],
                                      d_v=attn_cfg['d_v'],
                                      dropout=attn_cfg['dropout'])

        def make_feed_forward():
            # Fresh position-wise feed-forward instance per call site.
            return PositionwiseFeedForward(d_model=d_model,
                                           d_ff=ff_cfg['d_ff'],
                                           dropout=ff_cfg['dropout'])

        # Encoder: one layer template, replicated n_layers times inside Encoder.
        self.encoder = Encoder(
            layer=EncoderLayer(size=d_model,
                               self_attention=make_attention(),
                               feed_forward=make_feed_forward(),
                               dropout=dropout),
            n_layers=n_layers)

        # Decoder: self-attention plus attention over the encoder memory.
        self.decoder = Decoder(
            layer=DecoderLayer(size=d_model,
                               self_attn=make_attention(),
                               memory_attn=make_attention(),
                               feed_forward=make_feed_forward(),
                               dropout=dropout),
            N=n_layers)

        # One positional-encoding module shared by both embedding stacks.
        pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)

        self.src_embeddings = nn.Sequential(
            Embeddings(d_model=d_model, vocab_size=params['src_vocab_size']),
            pos_encoding)

        self.trg_embeddings = nn.Sequential(
            Embeddings(d_model=d_model, vocab_size=params['tgt_vocab_size']),
            pos_encoding)

        # Final projection from d_model to target-vocabulary logits.
        self.classifier = OutputClassifier(d_model=d_model,
                                           vocab=params['tgt_vocab_size'])

        # Initialize parameters with Glorot / fan_avg (weight matrices only).
        for weight in self.parameters():
            if weight.dim() > 1:
                nn.init.xavier_uniform_(weight)
Esempio n. 22
0
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 len_max_seq_enc,
                 len_max_seq_dec,
                 d_word_vec=512,
                 d_model=512,
                 d_inner=2048,
                 n_layers=6,
                 n_head=8,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 tgt_emb_prj_weight_sharing=True,
                 emb_src_tgt_weight_sharing=True,
                 pretrained_embeddings=None):
        """Build a sequence-to-sequence Transformer with optional weight tying.

        Args:
            n_src_vocab: source vocabulary size.
            n_tgt_vocab: target vocabulary size.
            len_max_seq_enc: maximum source sequence length, forwarded to the
                encoder (presumably sizes its positional table — TODO confirm).
            len_max_seq_dec: maximum target sequence length (same caveat).
            d_word_vec: word-embedding dimension; must equal ``d_model``.
            d_model: model (residual-stream) dimension.
            d_inner: feed-forward hidden dimension.
            n_layers: number of encoder and decoder layers.
            n_head: attention heads per layer.
            d_k: per-head key dimension.
            d_v: per-head value dimension.
            dropout: dropout probability forwarded to encoder and decoder.
            tgt_emb_prj_weight_sharing: tie the output projection weights to
                the target embedding matrix and scale logits by d_model**-0.5.
            emb_src_tgt_weight_sharing: share one embedding table between
                source and target (requires equal vocabulary sizes).
            pretrained_embeddings: optional pre-trained embedding weights,
                forwarded to both encoder and decoder.
        """

        super().__init__()

        self.encoder = Encoder(n_src_vocab=n_src_vocab,
                               len_max_seq=len_max_seq_enc,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_k=d_k,
                               d_v=d_v,
                               dropout=dropout,
                               pretrained_embeddings=pretrained_embeddings)

        self.decoder = Decoder(n_tgt_vocab=n_tgt_vocab,
                               len_max_seq=len_max_seq_dec,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_k=d_k,
                               d_v=d_v,
                               dropout=dropout,
                               pretrained_embeddings=pretrained_embeddings)

        # Bias-free projection from decoder states to target-vocabulary logits.
        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
             the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            # (overwrites the xavier init above); logits are rescaled to
            # compensate for the embedding scale.
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model**-0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight