Example #1
    def __init__(self,
                 input_dim,
                 in_channel,
                 channels,
                 kernel_sizes,
                 dropout,
                 bottleneck_dim=0,
                 param_init=0.1):

        super(GatedConvEncoder, self).__init__()

        (channels, kernel_sizes, _,
         _), _ = parse_cnn_config(channels, kernel_sizes, '', '')

        self.in_channel = in_channel
        assert input_dim % in_channel == 0
        self.input_freq = input_dim // in_channel
        self.bridge = None

        assert len(channels) > 0
        assert len(channels) == len(kernel_sizes)

        layers = OrderedDict()
        for lth in range(len(channels)):
            # pass the constructor's dropout rate through to every block
            layers['conv%d' % lth] = ConvGLUBlock(kernel_sizes[lth][0],
                                                  input_dim,
                                                  channels[lth],
                                                  weight_norm=True,
                                                  dropout=dropout)
            input_dim = channels[lth]

        # weight normalization + GLU for the last fully-connected layer
        self.fc_glu = nn.utils.weight_norm(nn.Linear(input_dim, input_dim * 2),
                                           name='weight',
                                           dim=0)
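        # NOTE: the doubled output width is consumed by a GLU in the forward
        # pass (not shown here), presumably via F.glu(self.fc_glu(x), dim=-1),
        # halving it back to input_dim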

        self._odim = int(input_dim)

        if bottleneck_dim > 0:
            self.bridge = nn.Linear(self._odim, bottleneck_dim)
            self._odim = bottleneck_dim

        self.layers = nn.Sequential(layers)

        self._factor = 1  # no temporal subsampling in this encoder

        self.reset_parameters(param_init)
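ConvGLUBlock itself is not shown in either excerpt. As background, here is a minimal sketch of a GLU-gated convolution in the spirit of Dauphin et al.'s gated convolutional LM, with a causal 1D layout; the class name, the padding scheme, and the omission of the bottleneck projection are illustrative assumptions, not the actual implementation.

# Minimal sketch only: the real ConvGLUBlock also supports a bottleneck
# projection and differs in details.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyConvGLUBlock(nn.Module):
    """Causal 1D convolution followed by a Gated Linear Unit.

    The convolution emits 2 * out_ch channels; F.glu splits them into a
    linear half a and a gate b, returning a * sigmoid(b).
    """

    def __init__(self, kernel_size, in_ch, out_ch, dropout=0.):
        super().__init__()
        self.pad_left = kernel_size - 1  # left-pad so step t sees only <= t
        self.conv = nn.utils.weight_norm(
            nn.Conv1d(in_ch, out_ch * 2, kernel_size), name='weight', dim=0)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, xs):
        # xs: [batch, in_ch, time]
        xs = F.pad(xs, (self.pad_left, 0))
        return self.dropout(F.glu(self.conv(xs), dim=1))

x = torch.randn(4, 64, 50)  # [batch, channels, time]
block = ToyConvGLUBlock(kernel_size=3, in_ch=64, out_ch=128, dropout=0.2)
print(block(x).shape)  # torch.Size([4, 128, 50])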
Example #2
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger.info(self.__class__.__name__)

        self.lm_type = args.lm_type
        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.n_units = args.n_units
        self.n_layers = args.n_layers
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []
        self.embed_cache = None

        self.embed = nn.Embedding(self.vocab,
                                  args.emb_dim,
                                  padding_idx=self.pad)
        self.dropout_embed = nn.Dropout(p=args.dropout_in)

        model_size = args.lm_type.replace('gated_conv_', '')

        blocks = OrderedDict()
        dropout = args.dropout_hidden
        if model_size == 'custom':
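            # NOTE: 'bottlececk_dim' (sic) is kept verbatim here and below;
            # it matches the spelling of the keyword in ConvGLUBlock's
            # signature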
            blocks['conv1'] = ConvGLUBlock(args.kernel_size,
                                           args.emb_dim,
                                           args.n_units,
                                           bottlececk_dim=args.n_projs,
                                           dropout=dropout)
            for lth in range(args.n_layers - 1):
                blocks['conv%d' % (lth + 2)] = ConvGLUBlock(
                    args.kernel_size,
                    args.n_units,
                    args.n_units,
                    bottlececk_dim=args.n_projs,
                    dropout=dropout)
            last_dim = args.n_units

        elif model_size == '8':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           900,
                                           dropout=dropout)
            for i in range(1, 8, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(4,
                                                      900,
                                                      900,
                                                      dropout=dropout)
            last_dim = 900

        elif model_size == '8B':
            blocks['conv1'] = ConvGLUBlock(1,
                                           args.emb_dim,
                                           512,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=128,
                                                      dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=256,
                                                      dropout=dropout)
            blocks['conv4'] = ConvGLUBlock(1,
                                           512,
                                           2048,
                                           bottlececk_dim=1024,
                                           dropout=dropout)
            last_dim = 2048

        elif model_size == '9':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           807,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d-1' % i] = ConvGLUBlock(4,
                                                        807,
                                                        807,
                                                        dropout=dropout)
                blocks['conv2-%d-2' % i] = ConvGLUBlock(4,
                                                        807,
                                                        807,
                                                        dropout=dropout)
            last_dim = 807

        elif model_size == '13':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           1268,
                                           dropout=dropout)
            for i in range(1, 13, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(4,
                                                      1268,
                                                      1268,
                                                      dropout=dropout)
            last_dim = 1268

        elif model_size == '14':
            for i in range(1, 4, 1):
                blocks['conv1-%d' % i] = ConvGLUBlock(
                    6, args.emb_dim if i == 1 else 850, 850, dropout=dropout)
            blocks['conv2'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
            for i in range(1, 5, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      850,
                                                      850,
                                                      dropout=dropout)
            blocks['conv4'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv5-%d' % i] = ConvGLUBlock(4,
                                                      850,
                                                      850,
                                                      dropout=dropout)
            blocks['conv6'] = ConvGLUBlock(4, 850, 1024, dropout=dropout)
            blocks['conv7'] = ConvGLUBlock(4, 1024, 2048, dropout=dropout)
            last_dim = 2048

        elif model_size == '14B':
            blocks['conv1'] = ConvGLUBlock(5,
                                           args.emb_dim,
                                           512,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=128,
                                                      dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      512 if i == 1 else 1024,
                                                      1024,
                                                      bottlececk_dim=512,
                                                      dropout=dropout)
            for i in range(1, 7, 1):
                blocks['conv4-%d' % i] = ConvGLUBlock(5,
                                                      1024 if i == 1 else 2048,
                                                      2048,
                                                      bottlececk_dim=1024,
                                                      dropout=dropout)
            blocks['conv5'] = ConvGLUBlock(5,
                                           2048,
                                           4096,
                                           bottlececk_dim=1024,
                                           dropout=dropout)
            last_dim = 4096

        else:
            raise NotImplementedError(model_size)

        self.blocks = nn.Sequential(blocks)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                last_dim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = nn.Linear(last_dim, self.vocab)
            if args.tie_embedding:
                # tying requires output.weight ([vocab, last_dim]) to share
                # the shape of embed.weight ([vocab, emb_dim])
                if last_dim != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, the last hidden dimension must be equal to emb_dim.'
                    )
                self.output.weight = self.embed.weight

        self.reset_parameters(args.param_init)
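For reference, a short sketch of how the nn.AdaptiveLogSoftmaxWithLoss branch above is typically driven at training and scoring time; the sizes are illustrative assumptions, and only the cutoff scheme is taken from the excerpt.

import torch
import torch.nn as nn

vocab, last_dim = 10000, 512  # illustrative sizes, not from the excerpt
asm = nn.AdaptiveLogSoftmaxWithLoss(
    last_dim,
    vocab,
    cutoffs=[vocab // 25, vocab // 5],  # same cutoff scheme as above
    div_value=4.0)

hidden = torch.randn(32, last_dim)  # flattened [batch * time, last_dim]
targets = torch.randint(0, vocab, (32,))

out = asm(hidden, targets)  # namedtuple with .output and .loss fields
print(out.loss)  # scalar NLL averaged over the batch
log_probs = asm.log_prob(hidden)  # [32, vocab], full log-distribution

Rare words beyond the cutoffs live in progressively smaller tail clusters (projection width divided by div_value per cluster), which is what keeps the large output layers of configurations like '14B' affordable.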