Example #1
def test_builtin_conf(app, status, warning):
    warnings = warning.getvalue()
    assert_in('master_doc', warnings,
        'override on builtin "master_doc" should raise a type warning')
    assert_not_in('language', warnings, 'explicitly permitted '
        'override on builtin "language" should NOT raise a type warning')
    assert_not_in('primary_domain', warnings, 'override to None on builtin '
        '"primary_domain" should NOT raise a type warning')
Example #2
    def __init__(self, dataset, datapath, indexer, preprocessor):
        dataset_media, dataset_regime = dataset.split('.')
        util.assert_in(dataset_media, ['yelp', 'twitter'])
        self.dataset_media = dataset_media
        self.dataset_regime = dataset_regime

        self.datapath = datapath
        self.pp = preprocessor
        if self.pp is None:
            self.pp = Preprocessor()
        self.indexer = indexer
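A minimal usage sketch of the constructor above. The enclosing class name is not shown in the excerpt, so `DatasetLoader`, the `Indexer()` call, and the path below are purely illustrative assumptions; only the argument semantics come from the code above:

# Hypothetical names and values for illustration only.
loader = DatasetLoader(dataset='yelp.small',      # '<media>.<regime>'; media must be 'yelp' or 'twitter'
                       datapath='/path/to/data',
                       indexer=Indexer(),
                       preprocessor=None)         # None falls back to a default Preprocessor()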
Example #3
    def __init__(self,
                 dimensions=200,
                 finetune=False,
                 vocab_size=1000,
                 pooling='max',
                 activation='relu',
                 kernel_sizes=(1, 2, 3),
                 filters=5,
                 dropout_rate=0.0,
                 lr=1e-3,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-08,
                 weight_decay=0.0,
                 embeddings_matrix=None):
        """

        :param dimensions: int: dimension of each vector
        :param finetune: bool : whether or not to finetune word embeddings
        :param vocab_size: int: size of the vocabulary; the embeddings layer will be this big

        :param pooling: ['max', 'average', 'logsumexp']: pooling operation for word vectors in a document
        :param activation: str: activation for convolutional stack
        :param kernel_sizes: tuple: convolve using unigrams / bigrams / trigrams
        :param filters: int : number of filters for convolutional layer
        :param dropout_rate: float: probability of dropout common across all the dropout layers

        :param lr: learning rate for the Adam optimiser
        :param beta_1: parameter for the Adam optimiser
        :param beta_2: parameter for the Adam optimiser
        :param epsilon: parameter for the Adam optimiser
        :param weight_decay: parameter for the Adam optimiser (L2 regularization weight, kernel_l2_regularization)

        :param embeddings_matrix: None or numpy.ndarray : embeddings_matrix to be used for the model
        """

        # Initialize torch model
        super(Net, self).__init__()

        # Validate arguments
        assert (type(dimensions) == int), type(dimensions)
        assert (type(finetune) == bool), type(finetune)
        assert (type(vocab_size) == int), type(vocab_size)

        util.assert_in(pooling, ['max', 'average', 'logsumexp'])
        assert (all(map(lambda x: isinstance(x, int),
                        kernel_sizes))), '{} should all be ints'.format(
                            str(kernel_sizes))
        assert (isinstance(filters, int)), type(filters)
        assert isinstance(dropout_rate, float)

        assert isinstance(lr, float)
        assert isinstance(beta_1, float)
        assert isinstance(beta_2, float)
        assert isinstance(epsilon, float)
        assert isinstance(weight_decay, float)

        if isinstance(embeddings_matrix, np.ndarray):
            assert (
                vocab_size, dimensions
            ) == embeddings_matrix.shape, "mismatched dimensions of embeddings_matrix"
        elif embeddings_matrix is None:
            pass
        else:
            raise TypeError("Unsupported embeddings_matrix type: " +
                            type(embeddings_matrix))

        # save hyperparameters
        self.hyperparameters = {
            k: v
            for k, v in locals().items()
            if k not in ('embeddings_matrix', 'self')
        }
        logger.debug(self.to_json(indent=None))

        # our layers

        # Pass the input through embeddings
        # https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222/12
        self.embeddings = nn.Embedding(vocab_size, dimensions)
        self.embeddings.weight.requires_grad = finetune
        if embeddings_matrix is not None:
            self.embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings_matrix))

        # add dropout layer
        # self.dropout = nn.Dropout(p=dropout_rate)

        # get the convolutional stack
        self.pad1_layers = []
        self.conv1_layers = []
        self.drop1_layers = []
        conv_stack = get_conv_stack(dimensions, filters, kernel_sizes,
                                    dropout_rate)

        for i, (pad, conv, drop) in enumerate(conv_stack):
            setattr(self, 'pad1_' + str(i), pad)
            self.pad1_layers.append('pad1_' + str(i))
            setattr(self, 'conv1_' + str(i), conv)
            self.conv1_layers.append('conv1_' + str(i))
            setattr(self, 'drop1_' + str(i), drop)
            self.drop1_layers.append('drop1_' + str(i))

        self.conv1_stack_pooling = pooling
        self.conv1_stack_activation = activation

        self.fc = nn.Linear(len(kernel_sizes) * filters, 1)
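A minimal usage sketch of the constructor above, assuming the helpers it references (util, logger, get_conv_stack, and the class's to_json method) are available, and that a pretrained embeddings matrix of shape (vocab_size, dimensions) is on hand; the random array below is purely illustrative:

import numpy as np

# Illustrative embeddings; any float array of shape (vocab_size, dimensions) works.
pretrained = np.random.rand(1000, 200).astype('float32')

model = Net(dimensions=200,
            finetune=False,              # keep the pretrained embeddings frozen
            vocab_size=1000,
            pooling='max',               # one of 'max', 'average', 'logsumexp'
            kernel_sizes=(1, 2, 3),      # unigram / bigram / trigram convolutions
            filters=5,
            dropout_rate=0.1,
            embeddings_matrix=pretrained)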