def initialize_data_and_model(config):
    c = config
    vocab = None
    if c['vocab_path']:
        vocab = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    data = ExtractiveQAData(path=c['data_path'],
                            vocab=vocab,
                            layout=c['layout'])
    # TODO: clean this up (it reaches into a private attribute of `data`)
    if c['dict_path']:
        dict_vocab = data.vocab
        if c['dict_vocab_path']:
            dict_vocab = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))
        data._retrieval = Retrieval(
            data.vocab,
            Dictionary(os.path.join(fuel.config.data_path[0], c['dict_path'])),
            max_def_length=c['max_def_length'],
            with_too_long_defs=c['with_too_long_defs'],
            max_def_per_word=c['max_def_per_word'],
            with_too_many_defs=c['with_too_many_defs'],
            # This should fix --exclude_top_k
            vocab_def=dict_vocab)
    logger.debug("Data loaded")
    qam = ExtractiveQAModel(
        c['dim'],
        c['emb_dim'],
        c['readout_dims'],
        c['num_input_words'],
        c['def_num_input_words'],
        data.vocab,
        coattention=c['coattention'],
        use_definitions=bool(c['dict_path']),
        def_word_gating=c['def_word_gating'],
        compose_type=c['compose_type'],
        reuse_word_embeddings=c['reuse_word_embeddings'],
        bidir_encoder=c['bidir_encoder'],
        random_unk=c['random_unk'],
        def_reader=c['def_reader'],
        weights_init=(GlorotUniform() if not c['init_width']
                      else Uniform(width=c['init_width'])),
        recurrent_weights_init=(GlorotUniform() if not c['rec_init_width']
                                else Uniform(width=c['rec_init_width'])),
        biases_init=Constant(0.))
    qam.initialize()
    logger.debug("Model created")
    if c['embedding_path']:
        qam.set_embeddings(
            numpy.load(
                os.path.join(fuel.config.data_path[0], c['embedding_path'])))
        logger.debug("Embeddings loaded")
    return data, qam
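
The function above expects a flat config dict. Below is a minimal calling sketch; every value is an illustrative placeholder (real configs live in the repository and define the valid choices), and the project-local imports are assumed to be in scope.

# Hypothetical config for illustration only; the keys mirror those read above.
config = {
    'data_path': 'squad/train.h5', 'layout': 'squad', 'vocab_path': '',
    'dict_path': '', 'dict_vocab_path': '',
    'max_def_length': 30, 'with_too_long_defs': 'drop',
    'max_def_per_word': 20, 'with_too_many_defs': 'drop',
    'dim': 128, 'emb_dim': 300, 'readout_dims': [],
    'num_input_words': 10000, 'def_num_input_words': 10000,
    'coattention': True, 'def_word_gating': 'none',
    'compose_type': 'sum', 'reuse_word_embeddings': False,
    'bidir_encoder': True, 'random_unk': False, 'def_reader': 'LSTM',
    'init_width': 0, 'rec_init_width': 0, 'embedding_path': '',
}
data, qam = initialize_data_and_model(config)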
Example #2
    def __init__(
            self,
            dim,
            emb_dim,
            vocab,
            def_emb_translate_dim=-1,
            def_dim=-1,
            encoder='bilstm',
            bn=True,
            def_reader=None,
            def_combiner=None,
            dropout=0.5,
            num_input_words=-1,
            # Others
            **kwargs):

        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(input_dim=emb_dim,
                                             output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words,
                                   emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])

        ## BiLSTM no. 2 (encoded attentioned embeddings)
        self._hyp_bidir_fork2 = Linear(8 * dim,
                                       4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim,
                                        4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])

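        # Collected so that callers can re-set weights_init on every recurrent
        # brick before initialize() (see _initialize_esim_model_and_data below).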
        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False,
                                           name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")

        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)
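
A minimal construction sketch for the definition-free baseline of this constructor; the hyperparameter values are illustrative, `vocab` is assumed to come from a loaded dataset (e.g. data.vocab), and the Blocks initialization bricks are assumed to be imported as in the surrounding code.

esim = ESIM(
    dim=300, emb_dim=300, vocab=vocab,
    encoder='bilstm', bn=True,
    def_reader=None, def_combiner=None,   # no dictionary: plain word embeddings
    dropout=0.5, num_input_words=-1,      # -1 keeps the full vocabulary
    weights_init=GlorotUniform(), biases_init=Constant(0.))
esim.push_initialization_config()
esim.initialize()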
Example #3
def _initialize_simple_model_and_data(c):

    if c['vocab']:
        vocab = Vocabulary(c['vocab'])
    else:
        vocab = None
    # Load data
    data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab)

    if vocab is None:
        vocab = data.vocab

    if c.get('vocab_text', ''):
        vocab_text = Vocabulary(c['vocab_text'])
    else:
        vocab_text = vocab

    # Dict
    if c['dict_path']:
        dictionary = Dictionary(c['dict_path'])
        logging.info("Loaded dict with {} entries".format(
            dictionary.num_entries()))

        if len(c['vocab_def']):
            retrieval_vocab = Vocabulary(c['vocab_def'])
        else:
            retrieval_vocab = data.vocab

        retrieval = Retrieval(vocab_text=vocab_text,
                              vocab_def=retrieval_vocab,
                              dictionary=dictionary,
                              max_def_length=c['max_def_length'],
                              with_too_long_defs=c['with_too_long_defs'],
                              exclude_top_k=c['exclude_top_k'],
                              max_def_per_word=c['max_def_per_word'])

        data.set_retrieval(retrieval)
    else:
        retrieval = None
        dictionary = None
        retrieval_vocab = None

    # def_emb_dim defaults to emb_dim; def_emb_translate_dim defaults to def_emb_dim.
    def_emb_dim = c.get('def_emb_dim', 0)
    if def_emb_dim <= 0:
        def_emb_dim = c['emb_dim']
    def_emb_translate_dim = c.get('def_emb_translate_dim', 0)
    if def_emb_translate_dim <= 0:
        def_emb_translate_dim = def_emb_dim

    # Initialize
    simple = NLISimple(
        # Baseline arguments
        emb_dim=c['emb_dim'],
        vocab=data.vocab,
        encoder=c['encoder'],
        dropout=c['dropout'],
        num_input_words=c['num_input_words'],
        mlp_dim=c['mlp_dim'],

        # Dict lookup kwargs (will get refactored)
        translate_dim=c['translate_dim'],
        retrieval=retrieval,
        compose_type=c['compose_type'],
        reader_type=c['reader_type'],
        disregard_word_embeddings=c['disregard_word_embeddings'],
        def_vocab=retrieval_vocab,
        def_emb_dim=c['def_emb_dim'],
        combiner_dropout=c['combiner_dropout'],
        share_def_lookup=c['share_def_lookup'],
        combiner_dropout_type=c['combiner_dropout_type'],
        combiner_bn=c['combiner_bn'],
        combiner_gating=c['combiner_gating'],
        combiner_shortcut=c['combiner_shortcut'],
        combiner_reader_translate=c['combiner_reader_translate'],
        def_dim=c['def_dim'],
        num_input_def_words=c['num_input_def_words'],
        def_emb_translate_dim=def_emb_translate_dim,

        # Init
        weights_init=GlorotUniform(),
        biases_init=Constant(0.0))
    simple.push_initialization_config()
    if c['encoder'] == 'rnn':
        simple._rnn_encoder.weights_init = Uniform(std=0.1)
    simple.initialize()

    if c.get('embedding_def_path', ''):
        embeddings = np.load(c['embedding_def_path'])
        simple.set_def_embeddings(embeddings.astype(theano.config.floatX))

    if c['embedding_path']:
        embeddings = np.load(c['embedding_path'])
        simple.set_embeddings(embeddings.astype(theano.config.floatX))

    return simple, data, dictionary, retrieval, vocab
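
The two dimension fallbacks above (and in Example #4 below) resolve in a fixed order: def_emb_dim falls back to emb_dim, and def_emb_translate_dim falls back to def_emb_dim. A tiny illustration with hypothetical configs:

c1 = {'emb_dim': 300}                        # no overrides
# -> def_emb_dim == 300, def_emb_translate_dim == 300
c2 = {'emb_dim': 300, 'def_emb_dim': 100}    # only the definition embedding size set
# -> def_emb_dim == 100, def_emb_translate_dim == 100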
Example #4
def _initialize_esim_model_and_data(c):

    if c['vocab']:
        vocab = Vocabulary(c['vocab'])
    else:
        vocab = None

    # Load data
    data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab)

    if vocab is None:
        vocab = data.vocab

    if c.get('vocab_text', ''):
        vocab_text = Vocabulary(c['vocab_text'])
    else:
        vocab_text = vocab

    # def_emb_dim defaults to emb_dim; def_emb_translate_dim defaults to def_emb_dim.
    def_emb_dim = c.get('def_emb_dim', 0)
    if def_emb_dim <= 0:
        def_emb_dim = c['emb_dim']
    def_emb_translate_dim = c.get('def_emb_translate_dim', 0)
    if def_emb_translate_dim <= 0:
        def_emb_translate_dim = def_emb_dim

    # Dict
    if c['dict_path']:
        dictionary = Dictionary(c['dict_path'])
        logging.info("Loaded dict with {} entries".format(
            dictionary.num_entries()))

        if len(c['vocab_def']):
            retrieval_vocab = Vocabulary(c['vocab_def'])
        else:
            retrieval_vocab = data.vocab

        retrieval = Retrieval(vocab_text=vocab_text,
                              vocab_def=retrieval_vocab,
                              dictionary=dictionary,
                              max_def_length=c['max_def_length'],
                              with_too_long_defs=c['with_too_long_defs'],
                              exclude_top_k=c['exclude_top_k'],
                              max_def_per_word=c['max_def_per_word'])

        data.set_retrieval(retrieval)

        num_input_def_words = (c['num_input_def_words']
                               if c['num_input_def_words'] > 0
                               else c['num_input_words'])

        # TODO: Refactor how the lookup is passed to the reader. Very inconvenient ATM.
        if c['reader_type'] == "rnn":
            def_reader = LSTMReadDefinitions(
                num_input_words=num_input_def_words,
                weights_init=Uniform(width=0.1),
                biases_init=Constant(0.),
                dim=c['def_dim'],
                emb_dim=def_emb_dim,
                vocab=vocab,
                lookup=None)
        elif c['reader_type'] == "mean":
            def_reader = MeanPoolReadDefinitions(
                num_input_words=num_input_def_words,
                translate=c['combiner_reader_translate'],
                vocab=vocab,
                weights_init=Uniform(width=0.1),
                lookup=None,
                dim=def_emb_translate_dim,
                biases_init=Constant(0.),
                emb_dim=def_emb_dim)
        else:
            raise NotImplementedError()

        def_combiner = MeanPoolCombiner(
            dim=c['def_dim'],
            emb_dim=def_emb_translate_dim,
            dropout=c['combiner_dropout'],
            dropout_type=c['combiner_dropout_type'],
            def_word_gating=c['combiner_gating'],
            shortcut_unk_and_excluded=c['combiner_shortcut'],
            num_input_words=num_input_def_words,
            exclude_top_k=c['exclude_top_k'],
            vocab=vocab,
            compose_type=c['compose_type'],
            weights_init=Uniform(width=0.1),
            biases_init=Constant(0.))

    else:
        retrieval = None
        dictionary = None
        def_combiner = None
        def_reader = None

    # Initialize

    simple = ESIM(
        # Baseline arguments
        emb_dim=c['emb_dim'],
        vocab=data.vocab,
        encoder=c['encoder'],
        dropout=c['dropout'],
        def_emb_translate_dim=def_emb_translate_dim,
        num_input_words=c['num_input_words'],
        def_dim=c['def_dim'],
        dim=c['dim'],
        bn=c.get('bn', True),
        def_combiner=def_combiner,
        def_reader=def_reader,

        # Init
        weights_init=GlorotUniform(),
        biases_init=Constant(0.0))
    simple.push_initialization_config()
    # TODO: Not sure anymore why we do that
    if c['encoder'] == 'bilstm':
        for enc in simple._rnns:
            enc.weights_init = Uniform(std=0.1)
    simple.initialize()

    if c['embedding_path']:
        embeddings = np.load(c['embedding_path'])
        simple.set_embeddings(embeddings.astype(theano.config.floatX))

    if c.get('embedding_def_path', ''):
        embeddings = np.load(c['embedding_def_path'])
        simple.set_def_embeddings(embeddings.astype(theano.config.floatX))

    return simple, data, dictionary, retrieval, vocab
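
A hedged end-to-end sketch of calling the builder above with a dictionary-free config; the values are placeholders, and real configs in the repository define many more keys.

c = {
    'vocab': '', 'data_path': 'snli', 'layout': 'snli', 'dict_path': '',
    'emb_dim': 300, 'dim': 300, 'def_dim': 100, 'dropout': 0.5,
    'encoder': 'bilstm', 'bn': True,
    'num_input_words': 0, 'embedding_path': '',
}
model, data, dictionary, retrieval, vocab = _initialize_esim_model_and_data(c)
# With an empty 'dict_path', `dictionary` and `retrieval` come back as None.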