def multitask_attention_model(output_size,
                              pos_vocab_size,
                              lex_vocab_size,
                              config_params,
                              visualize=False,
                              plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])
    embedding_size = 768
    max_seq_len = 512

    in_id = Input(shape=(max_seq_len, ), name="input_ids")
    in_mask = Input(shape=(max_seq_len, ), name="input_masks")
    in_segment = Input(shape=(max_seq_len, ), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output_ = BertEmbeddingLayer(n_fine_tune_layers=3,
                                      pooling="mean")(bert_inputs)
    bert_output = Reshape((max_seq_len, embedding_size))(bert_output_)

    # Mask over the candidate synsets, added to the WSD logits before the softmax.
    in_synset_mask = Input(shape=(None, output_size),
                           batch_size=batch_size,
                           name='Candidate_Synsets_Mask')
    bert_inputs.append(in_synset_mask)

    bilstm = Bidirectional(LSTM(hidden_size,
                                dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)),
                           merge_mode='sum')(bert_output)

    attention = SeqSelfAttention(units=128,
                                 attention_activation='sigmoid',
                                 name='Attention')(bilstm)

    logits = TimeDistributed(Dense(output_size))(attention)
    logits_mask = Add()([logits, in_synset_mask])

    pos_logits = TimeDistributed(Dense(pos_vocab_size),
                                 name='POS_logits')(attention)
    lex_logits = TimeDistributed(Dense(lex_vocab_size),
                                 name='LEX_logits')(attention)

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    pos_output = Softmax(name="POS_output")(pos_logits)
    lex_output = Softmax(name="LEX_output")(lex_logits)

    model = Model(inputs=bert_inputs,
                  outputs=[wsd_output, pos_output, lex_output],
                  name='Bert_BiLSTM_ATT_MultiTask')

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
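

# Hedged usage sketch (not part of the original code): how the multitask model
# above might be trained. config_params values are strings because the function
# casts them with int(); BertEmbeddingLayer and its TF-Hub backend must be
# importable for the model to actually build. All sizes and arrays below are
# made-up placeholders.
def demo_multitask_attention_model():
    import numpy as np

    n_senses, n_pos, n_lex = 1000, 20, 45
    max_seq_len, batch = 512, 2
    config_params = {'hidden_size': '128', 'batch_size': str(batch)}

    model = multitask_attention_model(n_senses, n_pos, n_lex, config_params)

    # Dummy BERT inputs (token ids, attention mask, segment ids) plus the
    # candidate-synsets mask fed through 'Candidate_Synsets_Mask'.
    ids = np.zeros((batch, max_seq_len), dtype='int32')
    masks = np.ones((batch, max_seq_len), dtype='int32')
    segs = np.zeros((batch, max_seq_len), dtype='int32')
    cand = np.zeros((batch, max_seq_len, n_senses), dtype='float32')

    # One sparse-label target per output head (WSD, POS, LEX).
    y_wsd = np.zeros((batch, max_seq_len, 1), dtype='int32')
    y_pos = np.zeros((batch, max_seq_len, 1), dtype='int32')
    y_lex = np.zeros((batch, max_seq_len, 1), dtype='int32')

    model.fit([ids, masks, segs, cand], [y_wsd, y_pos, y_lex],
              batch_size=batch, epochs=1)
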
def attention_model(vocabulary_size, config_params,
                    output_size, pos_vocab_size,
                    lex_vocab_size, visualize=False,
                    plot=False, tokenizer=None):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None,), dtype=input_type,
                         batch_size=batch_size)

    if tokenizer is not None:
        embedding = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
    else:
        embedding_size = int(config_params['embedding_size'])
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_size,
                              mask_zero=True,
                              name="Embeddings")(in_sentences)

    bilstm = Bidirectional(LSTM(hidden_size, dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)
                                ),
                           merge_mode='sum')(embedding)

    attention = SeqSelfAttention(attention_activation='sigmoid',
                                 name='Attention')(bilstm)

    logits = TimeDistributed(Dense(output_size))(attention)
    in_mask = Input(shape=(None, output_size), batch_size=batch_size,
                    name='Candidate_Synsets_Mask')
    logits_mask = Add()([logits, in_mask])

    pos_logits = TimeDistributed(Dense(pos_vocab_size),
                                 name='POS_logits')(attention)
    lex_logits = TimeDistributed(Dense(lex_vocab_size),
                                 name='LEX_logits')(attention)

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    pos_output = Softmax(name="POS_output")(pos_logits)
    lex_output = Softmax(name="LEX_output")(lex_logits)

    model = Model(inputs=[in_sentences, in_mask],
                  outputs=[wsd_output, pos_output, lex_output],
                  name='BiLSTM_ATT_MultiTask')

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(), metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
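
# Hedged sketch (an assumption, not project code): when `tokenizer` is not None
# the model above receives raw token strings, because ElmoEmbeddingLayer (a
# custom layer in this project) consumes string tensors. Sentences are padded
# to a common length; the padding token shown here is a guess.
def demo_elmo_string_batch():
    import numpy as np

    sentences = [["the", "bank", "of", "the", "river"],
                 ["he", "sat", "on", "the", "bank", "to", "rest"]]
    max_len = max(len(s) for s in sentences)
    padded = np.array([s + [""] * (max_len - len(s)) for s in sentences],
                      dtype=object)

    # padded.shape == (batch_size, max_len); each entry is one token string,
    # matching the (None,) string Input fed to ElmoEmbeddingLayer.
    return padded
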
def attention_model(vocabulary_size, config_params,
                    output_size, weights=None,
                    tokenizer=None, visualize=False, plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None,), dtype=input_type,
                         batch_size=batch_size)
    in_mask = Input(shape=(None, output_size), batch_size=batch_size,
                    name='Candidate_Synsets_Mask')

    if tokenizer is not None:
        embedding = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
    elif weights is not None:
        embedding_size = weights.shape[1]
        train = False  # whether to fine-tune the pretrained embeddings
        embedding = Embedding(input_dim=output_size, output_dim=embedding_size,
                              weights=[weights], trainable=train,
                              mask_zero=True)(in_sentences)
    else:
        embedding_size = int(config_params['embedding_size'])
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_size,
                              mask_zero=True,
                              name="Embeddings")(in_sentences)

    bilstm = Bidirectional(LSTM(hidden_size, dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)
                                ),
                           merge_mode='sum')(embedding)

    attention = SeqSelfAttention(attention_activation='sigmoid',
                                 name='Attention')(bilstm)

    logits = TimeDistributed(Dense(output_size))(attention)
    logits_mask = Add()([logits, in_mask])

    output = Softmax()(logits_mask)

    model = Model(inputs=[in_sentences, in_mask],
                  outputs=output, name="SensEmbed_Attention")

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(), metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
def baseline_model(vocabulary_size,
                   config_params,
                   output_size,
                   tokenizer=None,
                   visualize=False,
                   plot=False):
    name = 'Baseline'
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None, ),
                         dtype=input_type,
                         batch_size=batch_size,
                         name='Input')

    if tokenizer is not None:
        embedding = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
        name = f'Elmo_{name}'
    else:
        embedding_size = int(config_params['embedding_size'])
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_size,
                              mask_zero=True,
                              name="Embeddings")(in_sentences)

    bilstm = Bidirectional(LSTM(hidden_size,
                                dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)),
                           merge_mode='sum')(embedding)

    logits = TimeDistributed(Dense(output_size))(bilstm)

    in_mask = Input(shape=(None, output_size),
                    batch_size=batch_size,
                    name='Candidate_Synsets_Mask')

    logits_mask = Add()([logits, in_mask])
    output = Softmax()(logits_mask)

    model = Model(inputs=[in_sentences, in_mask], outputs=output, name=name)

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
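
# Hedged sketch (an assumption about the data pipeline, not project code): every
# model here restricts the WSD softmax the same way, by adding a mask to the
# logits through the 'Candidate_Synsets_Mask' input. Entries of 0 keep a sense
# reachable, while a large negative value drives its probability to ~0 after the
# Softmax. One possible way to build such a mask per sentence:
def build_candidate_mask(candidate_ids, seq_len, output_size, neg=-1e9):
    """candidate_ids: one list of admissible sense indices per token; an empty
    list leaves that token unrestricted."""
    import numpy as np

    mask = np.full((seq_len, output_size), neg, dtype='float32')
    for t, candidates in enumerate(candidate_ids):
        if not candidates:              # unrestricted token
            mask[t, :] = 0.0
        else:                           # only the candidate senses stay at 0
            mask[t, candidates] = 0.0
    mask[len(candidate_ids):, :] = 0.0  # padding positions left unrestricted
    return mask

# e.g. the second token may only take senses 4 or 7 out of 10:
# build_candidate_mask([[], [4, 7], []], seq_len=5, output_size=10)
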
def attention_model(output_size,
                    max_seq_len,
                    config_params,
                    visualize=False,
                    plot=False):
    embedding_size = 768
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    in_id = Input(shape=(max_seq_len, ), name="input_ids")
    in_mask = Input(shape=(max_seq_len, ), name="input_masks")
    in_segment = Input(shape=(max_seq_len, ), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output_ = BertEmbeddingLayer(n_fine_tune_layers=3,
                                      pooling="mean")(bert_inputs)
    bert_output = Reshape((max_seq_len, embedding_size))(bert_output_)

    bilstm = Bidirectional(
        LSTM(hidden_size,
             dropout=0.2,
             recurrent_dropout=0.2,
             return_sequences=True))(bert_output)
    attention = SeqSelfAttention(attention_activation='sigmoid',
                                 name='Attention')(bilstm)

    logits = TimeDistributed(Dense(output_size))(attention)

    in_synset_mask = Input(shape=(None, output_size),
                           batch_size=batch_size,
                           name='Candidate_Synsets_Mask')
    bert_inputs.append(in_synset_mask)

    logits_mask = Add()([logits, in_synset_mask])
    output = Softmax()(logits_mask)

    mdl = Model(inputs=bert_inputs,
                outputs=output,
                name="Bert_Attention_BiLSTM")

    mdl.compile(loss="sparse_categorical_crossentropy",
                optimizer=Adadelta(),
                metrics=['acc'])

    visualize_plot_mdl(visualize, plot, mdl)

    return mdl
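
# Hedged sketch (standard BERT input convention, not project code): the three
# inputs named "input_ids", "input_masks" and "segment_ids" above are padded to
# max_seq_len; the attention mask is 1 over real tokens and 0 over padding, and
# segment ids are all zeros for single sentences. WordPiece tokenization itself
# happens elsewhere and is not shown.
def pad_bert_example(token_ids, max_seq_len=512):
    import numpy as np

    ids = np.zeros(max_seq_len, dtype='int32')
    mask = np.zeros(max_seq_len, dtype='int32')
    ids[:len(token_ids)] = token_ids
    mask[:len(token_ids)] = 1
    segments = np.zeros(max_seq_len, dtype='int32')
    return ids, mask, segments

# e.g. pad_bert_example([101, 2023, 2003, 1037, 7279, 102]) for
# "[CLS] this is a pen [SEP]" (ids are illustrative).
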
def multitask_seq2seq_model(output_size,
                            pos_vocab_size,
                            lex_vocab_size,
                            config_params,
                            visualize=False,
                            plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])
    embedding_size = 768
    max_seq_len = 512
    in_id = Input(shape=(max_seq_len, ), name="input_ids")
    in_mask = Input(shape=(max_seq_len, ), name="input_masks")
    in_segment = Input(shape=(max_seq_len, ), name="segment_ids")
    bert_inputs_ = [in_id, in_mask, in_segment]

    bert_output_ = BertEmbeddingLayer(n_fine_tune_layers=3,
                                      pooling="mean")(bert_inputs_)
    bert_output = Reshape((max_seq_len, embedding_size))(bert_output_)

    input_mask = Input(shape=(None, output_size),
                       batch_size=batch_size,
                       name='Candidate_Synsets_Mask')

    bert_inputs_.append(input_mask)

    bilstm, forward_h, _, backward_h, _ = Bidirectional(
        LSTM(hidden_size,
             return_sequences=True,
             return_state=True,
             dropout=0.2,
             recurrent_dropout=0.2,
             input_shape=(None, None, embedding_size)),
        merge_mode='sum',
        name='Encoder_BiLSTM')(bert_output)

    state_h = Concatenate()([forward_h, backward_h])

    context = SeqSelfAttention(units=128)([bilstm, state_h])

    concat = Concatenate()([bilstm, context])

    decoder_fwd_lstm = LSTM(hidden_size,
                            dropout=0.2,
                            recurrent_dropout=0.2,
                            return_sequences=True,
                            input_shape=(None, None, embedding_size),
                            name='Decoder_FWD_LSTM')(concat)

    decoder_bck_lstm = LSTM(hidden_size,
                            dropout=0.2,
                            recurrent_dropout=0.2,
                            return_sequences=True,
                            input_shape=(None, None, embedding_size),
                            go_backwards=True,
                            name='Decoder_BWD_LSTM')(decoder_fwd_lstm)

    decoder_bilstm = Concatenate()([decoder_fwd_lstm, decoder_bck_lstm])

    logits = TimeDistributed(Dense(output_size),
                             name='WSD_logits')(decoder_bilstm)
    logits_mask = Add(name="Masked_logits")([logits, input_mask])

    pos_logits = TimeDistributed(Dense(pos_vocab_size),
                                 name='POS_logits')(decoder_bilstm)
    lex_logits = TimeDistributed(Dense(lex_vocab_size),
                                 name='LEX_logits')(decoder_bilstm)

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    pos_output = Softmax(name="POS_output")(pos_logits)
    lex_output = Softmax(name="LEX_output")(lex_logits)

    model = Model(inputs=bert_inputs_,
                  outputs=[wsd_output, pos_output, lex_output],
                  name='Bert_Attention_Seq2Seq_MultiTask')

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  metrics=['accuracy'])

    visualize_plot_mdl(visualize, plot, model)

    return model
def seq2seq_model(output_size,
                  max_seq_len,
                  config_params,
                  visualize=False,
                  plot=False):
    drop, rdrop = 0.2, 0.2
    embedding_size = 768
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    in_id = Input(shape=(max_seq_len, ), name="input_ids")
    in_mask = Input(shape=(max_seq_len, ), name="input_masks")
    in_segment = Input(shape=(max_seq_len, ), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output_ = BertEmbeddingLayer(n_fine_tune_layers=3,
                                      pooling="mean")(bert_inputs)
    bert_output = Reshape((max_seq_len, embedding_size))(bert_output_)

    in_synset_mask = Input(shape=(None, output_size),
                           batch_size=batch_size,
                           name='Candidate_Synsets_Mask')

    bert_inputs.append(in_synset_mask)

    encoder_bilstm = Bidirectional(LSTM(hidden_size,
                                        dropout=drop,
                                        recurrent_dropout=rdrop,
                                        return_sequences=True,
                                        return_state=True,
                                        input_shape=(None, None,
                                                     embedding_size)),
                                   merge_mode='sum',
                                   name='Encoder_BiLSTM_1')(bert_output)

    encoder_bilstm2 = Bidirectional(LSTM(hidden_size,
                                         dropout=drop,
                                         recurrent_dropout=rdrop,
                                         return_sequences=True,
                                         return_state=True,
                                         input_shape=(None, None,
                                                      embedding_size)),
                                    merge_mode='sum',
                                    name='Encoder_BiLSTM_2')

    (encoder_outputs, forward_h, forward_c, backward_h,
     backward_c) = encoder_bilstm2(encoder_bilstm)

    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    encoder_states = [state_h, state_c]

    encoder_attention = SeqSelfAttention(attention_activation='sigmoid',
                                         name='Attention')(encoder_outputs)

    decoder_fwd_lstm, _, _ = LSTM(hidden_size,
                                  dropout=drop,
                                  recurrent_dropout=rdrop,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  name='Decoder_FWD_LSTM')(
                                      encoder_attention,
                                      initial_state=[forward_h, backward_h])

    decoder_bck_lstm, _, _ = LSTM(hidden_size,
                                  dropout=drop,
                                  recurrent_dropout=rdrop,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  go_backwards=True,
                                  name='Decoder_BWD_LSTM')(decoder_fwd_lstm)

    decoder_bilstm = Concatenate()([decoder_fwd_lstm, decoder_bck_lstm])

    decoder_output = TimeDistributed(Dense(output_size),
                                     name='TimeDist_Dense')(decoder_bilstm)

    logits_mask = Add()([decoder_output, in_synset_mask])

    decoder_outputs = Softmax()(logits_mask)

    model = Model(bert_inputs,
                  outputs=decoder_outputs,
                  name="Bert_Attention_Seq2Seq")

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
def baseline_model(vocabulary_size,
                   config_params,
                   output_size,
                   lex_output_size,
                   dom_output_size,
                   tokenizer=None,
                   visualize=False,
                   plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None, ),
                         dtype=input_type,
                         batch_size=batch_size,
                         name='Input')

    if tokenizer is not None:
        embedding = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
    else:
        embedding_size = int(config_params['embedding_size'])
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_size,
                              mask_zero=True,
                              name="Embeddings")(in_sentences)

    bilstm = Bidirectional(LSTM(hidden_size,
                                dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)),
                           merge_mode='sum')(embedding)

    stacked_bilstm = Bidirectional(LSTM(hidden_size,
                                        dropout=0.2,
                                        recurrent_dropout=0.2,
                                        return_sequences=True,
                                        input_shape=(None, None,
                                                     embedding_size)),
                                   merge_mode='sum')(bilstm)

    lex_logits = TimeDistributed(Dense(lex_output_size),
                                 name='LEX_logits')(bilstm)
    dom_logits = TimeDistributed(Dense(dom_output_size),
                                 name='DOM_logits')(bilstm)
    wsd_logits = TimeDistributed(Dense(output_size),
                                 name='WSD_logits')(stacked_bilstm)

    in_mask = Input(shape=(None, output_size),
                    batch_size=batch_size,
                    name='Candidate_Synsets_Mask')
    logits_mask = Add()([wsd_logits, in_mask])

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    lex_output = Softmax(name="LEX_output")(lex_logits)
    dom_output = Softmax(name="DOM_output")(dom_logits)

    model = Model(inputs=[in_sentences, in_mask],
                  outputs=[wsd_output, dom_output, lex_output],
                  name="Hierarchical")

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
def seq2seq_model(vocabulary_size, config_params, output_size,
                  pos_vocab_size, lex_vocab_size, tokenizer=None,
                  visualize=False, plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])
    embedding_size = int(config_params['embedding_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None,), dtype=input_type,
                         batch_size=batch_size, name='Input')

    if tokenizer is not None:
        embeddings = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
    else:
        embeddings = Embedding(input_dim=vocabulary_size,
                               output_dim=embedding_size,
                               mask_zero=True,
                               name="Embeddings")(in_sentences)
    bilstm, forward_h, _, backward_h, _ = Bidirectional(
        LSTM(hidden_size,
             return_sequences=True,
             return_state=True,
             dropout=0.2,
             recurrent_dropout=0.2,
             input_shape=(None, None, embedding_size)),
        merge_mode='sum',
        name='Encoder_BiLSTM')(embeddings)
    state_h = Concatenate()([forward_h, backward_h])

    encoder_attention = SeqSelfAttention(attention_activation='sigmoid',
                                         name='Attention')([bilstm, state_h])

    concat = Concatenate()([encoder_attention, bilstm])

    decoder_fwd_lstm, _, _ = LSTM(hidden_size, dropout=0.2,
                                  recurrent_dropout=0.2,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  name='Decoder_FWD_LSTM')(concat)

    decoder_bck_lstm, _, _ = LSTM(hidden_size,
                                  dropout=0.2,
                                  recurrent_dropout=0.2,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  go_backwards=True,
                                  name='Decoder_BWD_LSTM')(decoder_fwd_lstm)

    decoder_bilstm = Concatenate()([decoder_fwd_lstm, decoder_bck_lstm])

    logits = TimeDistributed(
        Dense(output_size), name='WSD_logits')(decoder_bilstm)
    in_mask = Input(shape=(None, output_size),
                    batch_size=batch_size, name='Candidate_Synsets_Mask')

    logits_mask = Add(name="Masked_logits")([logits, in_mask])
    pos_logits = TimeDistributed(Dense(pos_vocab_size),
                                 name='POS_logits')(decoder_bilstm)
    lex_logits = TimeDistributed(Dense(lex_vocab_size),
                                 name='LEX_logits')(decoder_bilstm)

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    pos_output = Softmax(name="POS_output")(pos_logits)
    lex_output = Softmax(name="LEX_output")(lex_logits)

    model = Model(inputs=[in_sentences, in_mask],
                  outputs=[wsd_output, pos_output, lex_output],
                  name='Seq2Seq_MultiTask')

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(), metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
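
# Hedged variant (an assumption, not in the original code): compiling with a
# single loss string applies "sparse_categorical_crossentropy" to every output
# with equal weight. If the auxiliary POS/LEX heads should count less than the
# WSD head, Keras also accepts per-output loss weights keyed by the Softmax
# layer names used in these multitask models.
def compile_with_loss_weights(model, wsd_w=1.0, pos_w=0.5, lex_w=0.5):
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(),
                  loss_weights={"WSD_output": wsd_w,
                                "POS_output": pos_w,
                                "LEX_output": lex_w},
                  metrics=['acc'])
    return model
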
def seq2seq_model(vocabulary_size, config_params, output_size, weights=None,
                  tokenizer=None, visualize=False, plot=False):
    drop, rdrop = 0.2, 0.2
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    encoder_inputs = Input(shape=(None,), dtype=input_type,
                           batch_size=batch_size)
    in_mask = Input(shape=(None, output_size),
                    batch_size=batch_size, name='Candidate_Synsets_Mask')

    if tokenizer is not None:
        encoder_embeddings = ElmoEmbeddingLayer()(encoder_inputs)
        embedding_size = 1024
    elif weights is not None:
        embedding_size = weights.shape[1]
        train = True  # whether to fine-tune the pretrained embeddings
        encoder_embeddings = Embedding(input_dim=output_size,
                                       output_dim=embedding_size,
                                       weights=[weights],
                                       trainable=train,
                                       mask_zero=True)(encoder_inputs)
    else:
        embedding_size = int(config_params['embedding_size'])
        encoder_embeddings = Embedding(
            input_dim=vocabulary_size, output_dim=embedding_size,
            mask_zero=True, name="Embeddings")(encoder_inputs)

    encoder_bilstm = Bidirectional(LSTM(hidden_size, dropout=drop,
                                        recurrent_dropout=rdrop,
                                        return_sequences=True,
                                        return_state=True,
                                        input_shape=(
                                            None, None, embedding_size)
                                        ),
                                   merge_mode='sum',
                                   name='Encoder_BiLSTM_1')(encoder_embeddings)

    encoder_bilstm2 = Bidirectional(LSTM(hidden_size, dropout=drop,
                                         recurrent_dropout=rdrop,
                                         return_sequences=True,
                                         return_state=True,
                                         input_shape=(
                                             None, None, embedding_size)
                                         ),
                                    merge_mode='sum', name='Encoder_BiLSTM_2')

    (encoder_outputs, forward_h, forward_c, backward_h,
     backward_c) = encoder_bilstm2(encoder_bilstm)

    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    encoder_states = [state_h, state_c]

    encoder_attention = SeqSelfAttention(
        attention_activation='sigmoid', name='Attention')(encoder_outputs)

    decoder_fwd_lstm, _, _ = LSTM(hidden_size, dropout=drop,
                                  recurrent_dropout=rdrop,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  name='Decoder_FWD_LSTM')(encoder_attention,
                                                           initial_state=[forward_h, backward_h])

    decoder_bck_lstm, _, _ = LSTM(hidden_size,
                                  dropout=drop,
                                  recurrent_dropout=rdrop,
                                  return_sequences=True,
                                  return_state=True,
                                  input_shape=(None, None, embedding_size),
                                  go_backwards=True,
                                  name='Decoder_BWD_LSTM')(decoder_fwd_lstm)

    decoder_bilstm = Concatenate()([decoder_fwd_lstm, decoder_bck_lstm])

    decoder_output = TimeDistributed(Dense(output_size),
                                     name='TimeDist_Dense')(decoder_bilstm)

    logits_mask = Add()([decoder_output, in_mask])

    decoder_outputs = Softmax()(logits_mask)

    model = Model([encoder_inputs, in_mask],
                  outputs=decoder_outputs, name="SensEmbed_Seq2Seq_Attention")

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(), metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model
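
# Hedged sketch (file name and vocabulary mapping are placeholders, not project
# code): the `weights` argument above is a 2-D numpy matrix of pretrained sense
# embeddings; since the Embedding layer is built with input_dim=output_size,
# the matrix needs output_size rows aligned with the sense vocabulary indices.
# One way to assemble it from word2vec-format vectors with gensim:
def load_sense_embedding_weights(path, sense2index, output_size):
    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors.load_word2vec_format(path, binary=False)
    weights = np.zeros((output_size, kv.vector_size), dtype='float32')
    for sense, idx in sense2index.items():
        if sense in kv:                 # senses missing from the file stay zero
            weights[idx] = kv[sense]
    return weights
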
def multitask_baseline_model(vocabulary_size, config_params,
                             output_size, pos_vocab_size,
                             lex_vocab_size, weights=None,
                             tokenizer=None, visualize=False,
                             plot=False):
    hidden_size = int(config_params['hidden_size'])
    batch_size = int(config_params['batch_size'])

    input_type = 'string' if tokenizer is not None else None
    in_sentences = Input(shape=(None,), dtype=input_type,
                         batch_size=batch_size, name='Input')

    if tokenizer is not None:
        embedding = ElmoEmbeddingLayer()(in_sentences)
        embedding_size = 1024
    elif weights is not None:
        embedding_size = weights.shape[1]
        train = False  # whether to fine-tune the pretrained embeddings
        embedding = Embedding(input_dim=output_size, output_dim=embedding_size,
                              weights=[weights], trainable=train,
                              mask_zero=True)(in_sentences)
    else:
        embedding_size = int(config_params['embedding_size'])
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_size,
                              mask_zero=True,
                              name="Embeddings")(in_sentences)

    bilstm = Bidirectional(LSTM(hidden_size, dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True,
                                input_shape=(None, None, embedding_size)
                                ),
                           merge_mode='sum')(embedding)

    logits = TimeDistributed(Dense(output_size))(bilstm)

    in_mask = Input(shape=(None, output_size), batch_size=batch_size,
                    name='Candidate_Synsets_Mask')

    logits_mask = Add()([logits, in_mask])

    pos_logits = TimeDistributed(Dense(pos_vocab_size),
                                 name='POS_logits')(bilstm)
    lex_logits = TimeDistributed(Dense(lex_vocab_size),
                                 name='LEX_logits')(bilstm)

    wsd_output = Softmax(name="WSD_output")(logits_mask)
    pos_output = Softmax(name="POS_output")(pos_logits)
    lex_output = Softmax(name="LEX_output")(lex_logits)

    model = Model(inputs=[in_sentences, in_mask],
                  outputs=[wsd_output, pos_output, lex_output],
                  name='SensEmbed_BiLSTM_MultiTask')

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=Adadelta(), metrics=['acc'])

    visualize_plot_mdl(visualize, plot, model)

    return model