Example #1
def build_task_towers(hp: HyperParameters,
                      n_tasks: int,
                      min_layers: int=1,
                      max_layers: int=5,
                      min_units_per_layer: int=8,
                      max_units_per_layer: int=40
                      ):
    """ Helper method to build task specific networks """
    task_towers = []
    n_layers = hp.Int(name="n_layers_tasks",
                      min_value=min_layers,
                      max_value=max_layers)
    for j in range(n_tasks):
        architecture = []
        for i in range(n_layers):
            n_units = hp.Int(name="n_units_layer_{0}_task_{1}".format(i, j),
                             min_value=min_units_per_layer,
                             max_value=max_units_per_layer)
            architecture.append(n_units)
        architecture.append(1)
        task_towers.append(MLP(architecture,
                               hp["hidden_layer_activation"],
                               hp["output_layer_activation"])
                          )
    return task_towers
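A note on the pattern above: hp.Int both registers the hyperparameter in the search space and returns a concrete value, so helpers like this can be exercised outside a running tuner. A minimal sketch, assuming the keras_tuner package name (older releases import as kerastuner):

from keras_tuner import HyperParameters

# On a fresh container, hp.Int returns the default if one was given,
# otherwise min_value, and records the entry in hp.values.
hp = HyperParameters()
n_layers = hp.Int(name="n_layers_tasks", min_value=1, max_value=5)
print(n_layers)   # -> 1 (min_value, since no default was given)
print(hp.values)  # -> {'n_layers_tasks': 1}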
Example #2
def build_hyper_cross_stitched_model(hp: HyperParameters, n_tasks: int,
                                     all_columns: List[str],
                                     cat_features_dim: Dict[str, int],
                                     restricted_hyperparameter_search: bool):
    """
    Build model for Cross Stitched networks

    Parameters
    ----------
    hp: instance of HyperParameters
        Hyper-Parameters that define architecture and training of neural networks

    n_tasks: int
        Number of tasks

    all_columns: list
        Names of the features

    cat_features_dim: dict
        Dictionary that maps from the name of categorical feature
        to its dimensionality.

    restricted_hyperparameter_search: bool
        If True, then fixes the following hyperparameters and does not optimize them.
        - batch_size = 1024
        - hidden_layer_activation = relu
        - optimizer = sgd

    Returns
    -------
    model: tensorflow.keras.models.Model
        Compiled Cross Stitched Networks Model
    """
    # define activation functions and preprocessing layer
    build_activation_functions(hp, restricted_hyperparameter_search)
    preprocessing_layer = build_preprocessing_layer_uci_income(
        hp, all_columns, cat_features_dim)
    # propagate input through preprocessing layer
    input_layer = Input(shape=(len(all_columns), ))
    x = preprocessing_layer(input_layer)

    # build cross-stitch network model
    n_layers = hp.Int("number_of_hidden_layers", min_value=2, max_value=8)
    for i in range(n_layers):
        n_units = hp.Int("n_units_layer_{0}".format(i),
                         min_value=8,
                         max_value=40)
        dense_layers_output = [
            Dense(n_units, hp["hidden_layer_activation"])(x)
            for _ in range(n_tasks)
        ]
        x = CrossStitchBlock()(dense_layers_output)
    output_layers = [
        Dense(1, hp['output_layer_activation'])(x) for _ in range(n_tasks)
    ]
    model = Model(inputs=input_layer, outputs=output_layers)
    return model
Example #3
def build_experts(hp: HyperParameters):
    """ Helper method to build expert networks for OMOE and MMOE"""
    architecture = []
    n_experts = hp.Int("n_experts", 4, 10, default=6)
    n_layers = hp.Int("n_layers_experts", 2, 4, default=2)
    for i in range(n_layers):
        n_units = hp.Int("n_units_experts_{0}".format(i), 10, 20)
        architecture.append( n_units )
    return [MLP(architecture, hp["hidden_layer_activation"]) for _ in range(n_experts)]
Example #4
def build_mtl_shared_bottom(hp: HyperParameters, n_tasks: int,
                            all_columns: List[str],
                            cat_features_dim: Dict[str, int],
                            restricted_hyperparameter_search: bool):
    """
    Build standard multi-task learning model with hard parameter sharing (shared bottom)

    Parameters
    ----------
    hp: instance of HyperParameters
        Hyper-Parameters that define architecture and training of neural networks

    n_tasks: int
        Number of tasks

    all_columns: list
        Names of the features

    cat_features_dim: dict
        Dictionary that maps from the name of categorical feature
        to its dimensionality.

    restricted_hyperparameter_search: bool
        If True, then fixes the following hyperparameters and does not optimize them.
        - batch_size = 1024
        - hidden_layer_activation = relu
        - optimizer = sgd

    Returns
    -------
    model: tensorflow.keras.models.Model
        Compiled standard MTL model with hard parameter sharing
    """
    # define activation functions and preprocessing layer
    build_activation_functions(hp, restricted_hyperparameter_search)
    preprocessing_layer = build_preprocessing_layer_uci_income(
        hp, all_columns, cat_features_dim)
    # propagate input through preprocessing layer
    input_layer = Input(shape=(len(all_columns), ))
    x = preprocessing_layer(input_layer)

    # build shared layers
    architecture = []
    n_layers = hp.Int("n_layers_experts", 2, 4, default=2)
    for i in range(n_layers):
        n_units = hp.Int("n_units_experts_{0}".format(i), 10, 20)
        architecture.append(n_units)
    shared_layers = MLP(architecture, hp["hidden_layer_activation"])
    shared_layers_output = shared_layers(x)

    # task layers
    task_towers = build_task_towers(hp, n_tasks)
    output_layer = [task(shared_layers_output) for task in task_towers]
    model = Model(inputs=input_layer, outputs=output_layer)
    return model
Example #5
    def _build_model(
        hp: HyperParameters,
        input_layer: KerasTensor,
        encoded_layer: KerasTensor,
    ) -> keras.Model:
        """Build the part of the architecture tunable by keras-tuner.

        Note:
            It is a relatively simple dense network, with self-normalizing layers.

        Args:
            hp: hyperparameters passed by the tuner.
            input_layer: The input layer of the model.
            encoded_layer: The encoding layer of the model.

        Returns:
            A tunable keras functional model.

        """
        x = encoded_layer
        for i in range(hp.Int("dense_layers", 1, 3, default=2)):
            x = layers.Dense(
                units=hp.Int(f"units_layer_{i + 1}",
                             min_value=32,
                             max_value=256,
                             step=32,
                             default=64),
                activation="selu",
                kernel_initializer=tf.keras.initializers.LecunNormal(),
            )(x)
            x = layers.AlphaDropout(0.5)(x)

        output_layer = layers.Dense(1, activation="sigmoid")(x)

        model = keras.Model(input_layer, output_layer)
        model.compile(
            optimizer=keras.optimizers.Adam(
                hp.Choice("learning_rate",
                          values=[1e-2, 1e-3, 1e-4],
                          default=1e-3)),
            loss="binary_crossentropy",
            metrics=[
                "accuracy",
                tfa.metrics.F1Score(num_classes=2,
                                    average="micro",
                                    threshold=0.5,
                                    name="f1_score"),
            ],
        )

        return model
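For context, a builder like _build_model above is usually driven from a keras_tuner.HyperModel whose build(hp) supplies the fixed input and encoding layers. A hedged sketch, assuming _build_model is accessible as shown; the input shape and the Dense stand-in encoder are illustrative, not the original project's preprocessing:

import keras_tuner as kt
from tensorflow import keras


class TunableClassifier(kt.HyperModel):
    # Wraps the tunable part: the tuner calls build(hp) once per trial.
    def build(self, hp: kt.HyperParameters) -> keras.Model:
        input_layer = keras.Input(shape=(20,))                                   # assumed shape
        encoded_layer = keras.layers.Dense(32, activation="relu")(input_layer)   # stand-in encoder
        return _build_model(hp, input_layer, encoded_layer)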
Example #6
def build_model(hp: HyperParameters):
    inputs = tf.keras.Input((15,))
    x = inputs
    y = inputs
    t_dropout = hp.Float('target_dropout', 0.0, 0.5, 0.1, default=0.2)
    p_dropout = hp.Float('pretrain_dropout', 0.0, 0.5, 0.1, default=0.2)

    for i in range(1):
        # hidden layer
        x = tf.keras.layers.Dense(2**hp.Int('target_exponent_{}'.format(i), 5, 8, default=6), activation='relu', kernel_initializer='he_uniform', name='target_dense_{}'.format(i))(x)
        y = tf.keras.layers.Dense(2**hp.Int('pretrain_exponent_{}'.format(i), 5, 8, default=6), activation='relu', kernel_initializer='he_uniform', name='pretrain_dense_{}'.format(i))(y)
        a = tf.keras.layers.Dense(2**hp.Int('adapter_exponent_{}'.format(i), 2, 6, default=4), activation='relu', kernel_initializer='he_uniform', name='target_adapter_{}'.format(i))(y)

        # dropout layer
        x = tf.keras.layers.Dropout(t_dropout, name='target_dropout_{}'.format(i))(x)
        x = tf.keras.layers.concatenate([x, a], name='target_concat_{}'.format(i))
        y = tf.keras.layers.Dropout(p_dropout, name='pretrain_dropout_{}'.format(i))(y)

    x = tf.keras.layers.Dense(18, activation='softmax', dtype='float32', name='target_output')(x)
    y = tf.keras.layers.Dense(18, activation='softmax', dtype='float32', name='pretrain_output')(y)
    model = tf.keras.Model(inputs=inputs, outputs=[x, y])

    return model
Example #7
def build_preprocessing_layer_uci_income(hp: HyperParameters,
                                         all_columns: List[str],
                                         cat_features_dim: Dict[str, int]
                                         ):
    """
    Helper method that builds the preprocessing layer for the UCI
    Census Income dataset.
    """
    feature_sparsity_threshold = hp.Int("feature_sparsity",
                                        min_value=3,
                                        max_value=10,
                                        default=3)
    return PreprocessingLayer(all_columns,
                              cat_features_dim,
                              feature_sparsity_threshold=feature_sparsity_threshold)
Example #8
def build_model(hp: HyperParameters):
    inputs = tf.keras.Input((15, ))
    x = inputs
    dropout = hp.Float('dropout', 0.0, 0.5, 0.1, default=0.2)
    for i in range(1):
        x = tf.keras.layers.Dense(
            2**hp.Int('exponent_{}'.format(i), 5, 8, default=6), 'relu')(x)
        x = tf.keras.layers.Dropout(dropout)(x)

    x = tf.keras.layers.Dense(18, activation='softmax', dtype='float32')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    model.compile('adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
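A self-contained builder like the one above plugs straight into a tuner. A minimal sketch; the dataset arrays x_train / y_train are placeholders:

import keras_tuner as kt

# Each trial calls build_model(hp) with freshly sampled hyperparameters.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    overwrite=True,
    directory='tuner_dir',
    project_name='dense_baseline',
)
tuner.search(x_train, y_train, validation_split=0.2, epochs=5)
best_model = tuner.get_best_models(num_models=1)[0]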
Example #9
    def define_hp(hp_model = None):
        hp = HyperParameters()
        
        if(hp_model is None):
            hp_model = SmallImagesHP()

        hp.Int(name = 'init',
            min_value = hp_model.init_min,
            max_value = hp_model.init_max,
            step = 32)  
                
        hp.Int(name = 'cnn_layers',
            min_value = hp_model.cnn_layers_min,
            max_value = hp_model.cnn_layers_max,
            step = 1)

        for i in range(1, hp_model.cnn_layers_max + 1):
            hp.Int(name = 'cnn_{0}'.format(i),
                min_value = hp_model.cnn_min,
                max_value = hp_model.cnn_max,
                step = 32)   

        hp.Int(name = 'cnn2_layers',
            min_value = hp_model.cnn2_layers_min,
            max_value = hp_model.cnn2_layers_max,
            step = 1)

        for i in range(1, hp_model.cnn2_layers_max + 1):
            hp.Int(name = 'cnn2_{0}'.format(i),
                min_value = hp_model.cnn2_min,
                max_value = hp_model.cnn2_max,
                step = 32)            

        hp.Int(name = 'dense',
            min_value = hp_model.dense_min,
            max_value = hp_model.dense_max,
            step = hp_model.dense_step)

        hp.Int(name = 'dense2',
            min_value = hp_model.dense2_min,
            max_value = hp_model.dense2_max,
            step = hp_model.dense2_step)

        hp.Choice('dropout', hp_model.dropout)
        hp.Choice('learning_rate', hp_model.learning_rate)

        return hp
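A pre-populated HyperParameters object like the one returned by define_hp is typically handed to the tuner so the declared ranges (and any Fixed entries) constrain the search. A hedged sketch of that hand-off; build_model_fn is a placeholder for whatever builder reads these names back via hp.get():

import keras_tuner as kt

hp = define_hp()                    # assumes define_hp is accessible (e.g. a staticmethod)
tuner = kt.RandomSearch(
    hypermodel=build_model_fn,      # placeholder model-building function
    hyperparameters=hp,             # reuse the declared ranges
    tune_new_entries=True,          # still tune anything the builder registers later
    objective='val_accuracy',
    max_trials=20,
)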
Example #10
    def construct_model(self,
                        tuned_params: Dict[str, Union[int, float]],
                        hps: HyperParameters = None) -> Model:
        hpf = HyperParameterFactory(self.default_parameters_values,
                                    tuned_params, hps)
        filter_0 = hpf.get_choice(FILTER0_NAME, [4, 8, 16, 32])
        filter_1 = hpf.get_choice(FILTER1_NAME, [32, 48, 64])
        filter_2 = hpf.get_choice(FILTER2_NAME, [64, 96, 128])
        max_pool_0 = hpf.get_choice(MAX_POOL_SIZE0, [1, 2])
        max_pool_1 = hpf.get_choice(MAX_POOL_SIZE1, [1, 2])
        max_pool_2 = hpf.get_choice(MAX_POOL_SIZE2, [1, 2, 4, 8])
        dense = hpf.get_int(
            DENSE_NAME, lambda default: hps.Int(
                DENSE_NAME, 32, 128, step=8, default=default))
        lr = hpf.get_choice(LEARNING_RATE_NAME, [1e-2, 1e-3, 1e-4])

        model = Sequential([
            Input(name='Input', shape=(12, 12, 7)),
            Conv2D(filter_0,
                   2,
                   strides=1,
                   activation=tf.nn.relu,
                   name='Conv2D_0'),
            MaxPooling2D(max_pool_0, name='MaxPool_0'),
            Conv2D(filter_1,
                   3,
                   strides=1,
                   activation=tf.nn.relu,
                   name='Conv2D_1'),
            MaxPooling2D(max_pool_1, name='MaxPool_1'),
            # Conv2D(self.get_param_value(FILTER2_NAME, tuned_params), 2, strides=1, activation=tf.nn.relu,
            #       name='Conv2D_2'),
            # MaxPooling2D(self.get_param_value(MAX_POOL_SIZE2, tuned_params), name='MaxPool_2'),
            Flatten(name='Flatten'),
            Dropout(0.1, name='Dropout'),
            Dense(dense, activation=tf.nn.relu, name='dense'),
            Dense(5, activation=tf.nn.softmax, name='Output'),
        ])
        loss_fn = tf.keras.losses.CategoricalCrossentropy()
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(optimizer=opt,
                      loss=loss_fn,
                      metrics=[tf.keras.metrics.categorical_accuracy])
        return model
Example #11
    def build_model(hp: kt.HyperParameters, use_avs_model: bool = False) -> Model:
        batch_size = config.generation.batch_size if stateful else None
        layer_names = name_generator('layer')

        inputs = {}
        per_stream = {}

        for col in seq.x_cols:
            shape = None, *seq.shapes[col][2:]
            inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
            per_stream[f'{col}'] = inputs[col]

        per_stream_list = list(per_stream.values())
        x = forgiving_concatenate(inputs=per_stream_list, axis=-1, name=layer_names.__next__(), )

        for i in range(hp.Int('TEST', 2, 8)):
            x = layers.LSTM(64, return_sequences=True)(x)

        outputs = {}
        loss = {}
        for col in seq.y_cols:
            if col in seq.categorical_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation='softmax'), name=col)(x)
                loss[col] = keras.losses.CategoricalCrossentropy()
            if col in seq.regression_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation=None), name=col)(x)
                loss[col] = 'mse'

        if config.training.AVS_proxy_ratio == 0:
            logging.log(logging.WARNING, f'Not using AVSModel with superior optimizer due to '
                                         f'{config.training.AVS_proxy_ratio=}.')
        model = Model(inputs=inputs, outputs=outputs)
        opt = keras.optimizers.Adam()

        model.compile(
            optimizer=opt,
            loss=loss,
            metrics=['acc'],
        )

        return model
Example #12
    def build_model(hp: kt.HyperParameters, use_avs_model: bool = False):
        batch_size = config.generation.batch_size if stateful else None
        layer_names = name_generator('layer')

        inputs = {}
        last_layer = []

        for col in seq.x_cols:
            shape = None, *seq.shapes[col][2:]
            inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
            last_layer.append(inputs[col])

        random.seed(43)
        for i in range(hp.Int(f'lstm_layers', 2, 7)):
            outs = []
            depth = hp.Int(f'depth_{i}', 4, 64, sampling='log')
            connections = min(hp.Int(f'connections_{i}', 1, 3), len(last_layer))
            dropout = hp.Float(f'dropout_{i}', 0, 0.5)
            for width_i in range(hp.Int(f'width_{i}', 1, 16)):
                t = layers.LSTM(depth, return_sequences=True,
                                name=f'lstm{i:03}_{width_i:03}_{layer_names.__next__()}',
                                stateful=stateful, )(
                    forgiving_concatenate(random.sample(last_layer, connections), name=layer_names.__next__()))
                t = layers.BatchNormalization(name=layer_names.__next__())(t)
                t = layers.Dropout(dropout, name=layer_names.__next__())(t)
                outs.append(t)
            last_layer = outs

        x = forgiving_concatenate(last_layer)
        outputs = {}
        loss = {}
        for col in seq.y_cols:
            if col in seq.categorical_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation='softmax'), name=col)(x)
                loss[col] = keras.losses.CategoricalCrossentropy(
                    label_smoothing=tf.cast(hp.Float('label_smoothing', 0.0, 0.7), 'float32'),
                )  # does not work well with mixed precision and stateful model
            if col in seq.regression_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation=None), name=col)(x)
                loss[col] = 'mse'

        if stateful or config.training.AVS_proxy_ratio == 0:
            if config.training.AVS_proxy_ratio == 0:
                logging.log(logging.WARNING, f'Not using AVSModel with superior optimizer due to '
                                             f'{config.training.AVS_proxy_ratio=}.')
            model = Model(inputs=inputs, outputs=outputs)
            opt = keras.optimizers.Adam()
        else:
            if use_avs_model:
                model = AVSModel(inputs=inputs, outputs=outputs, config=config)
            else:
                model = Model(inputs=inputs, outputs=outputs)

            lr_schedule = FlatCosAnnealSchedule(decay_start=len(seq) * 30,  # Give extra epochs to big batch_size
                                                initial_learning_rate=hp.Choice('initial_learning_rate',
                                                                                [3e-2, 1e-2, 8e-3, ]),
                                                decay_steps=len(seq) * 40,
                                                alpha=0.01, )
            # Ranger hyper params based on https://github.com/fastai/imagenette/blob/master/2020-01-train.md
            opt = tfa.optimizers.RectifiedAdam(learning_rate=lr_schedule,
                                               beta_1=0.95,
                                               beta_2=0.99,
                                               epsilon=1e-6)
            opt = tfa.optimizers.Lookahead(opt, sync_period=6, slow_step_size=0.5)

        model.compile(
            optimizer=opt,
            loss=loss,
            metrics=metrics.create_metrics((not stateful), config),
        )

        return model
Example #13
def fit_sim_model(X_train,
                  X_test,
                  y_train,
                  y_test,
                  model1,
                  model2,
                  results_file='results.csv',
                  embedding_file='sim_embeddings',
                  num_runs=1,
                  hp_file1=None,
                  hp_file2=None,
                  hp_pred_file=None,
                  params=None):
    params = params or PARAMS

    kg1 = pd.read_csv('./data/chemicals0.csv')
    kg2 = pd.read_csv('./data/taxonomy0.csv')

    kg1 = list(zip(kg1['subject'], kg1['predicate'], kg1['object']))
    kg2 = list(zip(kg2['subject'], kg2['predicate'], kg2['object']))

    entities1 = set([s for s, p, o in kg1]) | set([o for s, p, o in kg1])
    relations1 = set([p for s, p, o in kg1])
    entities2 = set([s for s, p, o in kg2]) | set([o for s, p, o in kg2])
    relations2 = set([p for s, p, o in kg2])

    me1 = {k: i for i, k in enumerate(entities1)}
    me2 = {k: i for i, k in enumerate(entities2)}
    mr1 = {k: i for i, k in enumerate(relations1)}
    mr2 = {k: i for i, k in enumerate(relations2)}
    kg1 = [(me1[s], mr1[p], me1[o]) for s, p, o in kg1]
    kg2 = [(me2[s], mr2[p], me2[o]) for s, p, o in kg2]

    output_dim = 1

    X_train, y_train = np.asarray([
        (me1[a], me2[b], float(x)) for a, b, x in X_train
        if a in entities1 and b in entities2
    ]), np.asarray([
        float(x) for x, a in zip(y_train, X_train)
        if a[0] in entities1 and a[1] in entities2
    ])

    X_test, y_test = np.asarray([(me1[a], me2[b], float(x))
                                 for a, b, x in X_test
                                 if a in entities1 and b in entities2
                                 ]), np.asarray([
                                     float(x) for x, a in zip(y_test, X_test)
                                     if a[0] in entities1 and a[1] in entities2
                                 ])

    scores = []
    k_best_predictions = []

    hp = HyperParameters()

    kg_lengths = list(map(len, [kg1, kg2]))
    output_lengths = len(X_train)

    hp.Fixed('num_entities1', len(entities1))
    hp.Fixed('num_entities2', len(entities2))
    hp.Fixed('num_relations1', len(relations1))
    hp.Fixed('num_relations2', len(relations2))

    hp.Fixed('embedding_model1', model1)
    hp.Fixed('embedding_model2', model2)
    hp.Fixed('output_dim', output_dim)

    bs = 1024

    if hp_file1 and hp_file2:
        for i, hp_file in enumerate([hp_file1, hp_file2]):
            with open(hp_file, 'r') as fp:
                data = json.load(fp)
                for k in data:
                    hp.Fixed(k + str(i + 1), data[k])
                    if k == 'batch_size':
                        bs = min(bs, data[k])
    else:
        for i, m in zip(['1', '2'], [model1, model2]):
            hp.Choice('dim' + i, [100, 200, 400], default=200)
            hp.Choice('negative_samples' + i, [10, 100], default=10)
            if m in ['ConvE', 'ConvR', 'ConvKB']:
                bs = 128
            hp.Choice('loss_function' + i, [
                'pairwize_hinge', 'pairwize_logistic', 'pointwize_hinge',
                'pointwize_logistic'
            ],
                      default='pairwize_hinge')
            w = kg_lengths[int(i) - 1] / max(kg_lengths)

    if hp_pred_file:
        with open(hp_pred_file, 'r') as fp:
            data = json.load(fp)
            for k in data:
                hp.Fixed(k, data[k])
    else:
        MAX_LAYERS = 3
        hp.Int('branching_num_layers_chemical', 0, MAX_LAYERS, default=1)
        hp.Int('branching_num_layers_species', 0, MAX_LAYERS, default=1)
        hp.Int('branching_num_layers_conc', 0, MAX_LAYERS, default=1)
        hp.Int('num_layers1', 0, 3, default=1)
        for i in range(MAX_LAYERS + 1):
            hp.Choice('branching_units_chemical_' + str(i + 1), [32, 128, 512],
                      default=128)
            hp.Choice('branching_units_species_' + str(i + 1), [32, 128, 512],
                      default=128)
            hp.Choice('branching_units_conc_' + str(i + 1), [32, 128, 512],
                      default=128)
            hp.Choice('units_' + str(i + 1), [32, 128, 512], default=128)

    # Since inputs are oversampled, we must reduce the weight of losses accordingly.
    w = output_lengths / max(kg_lengths)
    hp.Float('loss_weight1', w, 5 * w, sampling='log')
    hp.Float('loss_weight2', w, 5 * w, sampling='log')
    hp.Float('classification_loss_weight', w, 5 * w, sampling='log')
    hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    hp.Fixed('batch_size', bs)

    m = max(map(len, [kg1, kg2, X_train
                      ])) + (bs - max(map(len, [kg1, kg2, X_train])) % bs)
    Xtr, ytr = prep_data_v2(kg1, kg2, X_train, y_train, max_length=m)
    Xte, yte = prep_data_v2(kg1,
                            kg2,
                            X_test,
                            y_test,
                            test=True,
                            max_length=max(bs, len(y_test)))

    tuner = CVTuner(hypermodel=build_model,
                    oracle=kt.oracles.BayesianOptimization(
                        hyperparameters=hp,
                        objective=Objective('val_auc', 'max'),
                        max_trials=params['MAX_TRIALS']),
                    overwrite=True,
                    project_name='tmp/' + ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(11)))

    tuner.search(Xtr,
                 ytr,
                 epochs=params['SEARCH_MAX_EPOCHS'],
                 batch_size=bs,
                 callbacks=[
                     EarlyStopping('loss',
                                   mode='min',
                                   patience=params['PATIENCE'])
                 ],
                 kfolds=params['NUM_FOLDS'],
                 class_weight=params['cw'])

    results = []
    prediction = []
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    model = tuner.hypermodel.build(best_hps)

    out = dict()
    for k in best_hps.values.keys():
        out[k] = best_hps.values[k]
    with open('./sim_hp/%s.json' % hp_pred_file.split('/')[-1].split('_')[0],
              'w') as fp:
        json.dump(out, fp)

    for _ in range(num_runs):
        reset_weights(model)
        model.fit(Xtr,
                  ytr,
                  epochs=params['MAX_EPOCHS'],
                  batch_size=bs,
                  verbose=2,
                  class_weight=params['cw'],
                  callbacks=[
                      EarlyStopping('loss',
                                    mode='min',
                                    patience=params['PATIENCE'])
                  ])
        r = model.evaluate(Xte, yte, verbose=0, batch_size=bs)
        results.append(r)

    W1 = model.get_layer('embedding').get_weights()[0]
    W2 = model.get_layer('embedding_2').get_weights()[0]
    np.save(embedding_file + '_chemical_embeddings.npy', W1)
    np.save(embedding_file + '_chemical_ids.npy',
            np.asarray(list(zip(entities1, range(len(entities1))))))
    np.save(embedding_file + '_taxonomy_embeddings.npy', W2)
    np.save(embedding_file + '_taxonomy_ids.npy',
            np.asarray(list(zip(entities2, range(len(entities2))))))

    var = np.var(np.asarray(results), axis=0)
    results = np.mean(np.asarray(results), axis=0)

    df = pd.DataFrame(
        data={
            'metric': model.metrics_names,
            'value': list(results),
            'variance': list(var)
        })
    df.to_csv(results_file)
Example #14
    def build_model(hp: kt.HyperParameters, use_avs_model: bool = True):
        batch_size = config.generation.batch_size if stateful else None
        layer_names = name_generator('layer')

        inputs = {}
        per_stream = {}
        cnn_activation = {'relu': keras.activations.relu,
                          'elu': keras.activations.elu,
                          'mish': tfa.activations.mish}[hp.Choice('cnn_activation', ['relu', 'mish'])]

        cat_cnn_repetition = hp.Int('cat_cnn_repetition', 0, 4)
        cnn_spatial_dropout = hp.Float('spatial_dropout', 0.0, 0.5)
        cat_cnn_filters = hp.Int('cat_cnn_filters', 64, 256, sampling='log')
        reg_cnn_repetition = hp.Int('reg_cnn_repetition', 0, 4)
        reg_cnn_filters = hp.Int('reg_cnn_filters', 64, 256, sampling='log')
        cnn_kernel_size = hp.Choice(f'cnn_kernel_size', ['1', '3', '35', '37', ])

        for col in seq.x_cols:
            if col in seq.categorical_cols:
                shape = None, *seq.shapes[col][2:]
                inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
                per_stream[col] = inputs[col]
                for _ in range(cat_cnn_repetition):
                    per_stream[col] = forgiving_concatenate(inputs=[
                        layers.Conv1D(filters=cat_cnn_filters,
                                      kernel_size=int(s),
                                      activation=cnn_activation,
                                      padding='causal',
                                      kernel_initializer='lecun_normal',
                                      name=layer_names.__next__())(per_stream[col])
                        for conv_i, s in enumerate(cnn_kernel_size)],
                        axis=-1, name=layer_names.__next__(), )
                    per_stream[col] = layers.BatchNormalization(name=layer_names.__next__(), )(per_stream[col])
                    per_stream[col] = layers.SpatialDropout1D(cnn_spatial_dropout)(per_stream[col])
            if col in seq.regression_cols:
                shape = None, *seq.shapes[col][2:]
                inputs[col] = layers.Input(batch_size=batch_size, shape=shape, name=col)
                per_stream[col] = inputs[col]
                for _ in range(reg_cnn_repetition):
                    per_stream[col] = forgiving_concatenate(inputs=[
                        layers.Conv1D(filters=reg_cnn_filters,
                                      kernel_size=int(s),
                                      activation=cnn_activation,
                                      padding='causal',
                                      kernel_initializer='lecun_normal',
                                      name=layer_names.__next__())(per_stream[col])
                        for conv_i, s in enumerate(cnn_kernel_size)],
                        axis=-1, name=layer_names.__next__(), )
                    per_stream[col] = layers.BatchNormalization(name=layer_names.__next__(), )(per_stream[col])
                    per_stream[col] = layers.SpatialDropout1D(cnn_spatial_dropout)(per_stream[col])

        per_stream_list = list(per_stream.values())
        x = forgiving_concatenate(inputs=per_stream_list, axis=-1, name=layer_names.__next__(), )

        lstm_repetition = hp.Int('lstm_repetition', 0, 4)
        lstm_dropout = hp.Float('lstm_dropout', 0.0, 0.6)
        lstm_l2_regularizer = hp.Choice('lstm_l2_regularizer', [1e-2, 1e-4, 1e-6, 0.0])

        for i in range(lstm_repetition):
            if i > 0:
                x = layers.Dropout(lstm_dropout)(x)
            x = layers.LSTM(hp.Int(f'lstm_{i}_units', 128, 384, sampling='log'), return_sequences=True,
                            stateful=stateful, name=layer_names.__next__(),
                            kernel_regularizer=keras.regularizers.l2(lstm_l2_regularizer), )(x)
            x = layers.BatchNormalization(name=layer_names.__next__(), )(x)

        end_cnn_repetition = hp.Int('end_cnn_repetition', 0, 2)
        end_spatial_dropout = hp.Float('end_spatial_dropout', 0.0, 0.5)
        end_cnn_filters = hp.Int('end_cnn_filters', 128, 384, sampling='log')
        end_cnn_kernel_size = hp.Choice(f'end_cnn_kernel_size', ['1', '3', ])

        for _ in range(end_cnn_repetition):
            x = layers.SpatialDropout1D(end_spatial_dropout)(x)
            x = forgiving_concatenate(inputs=[
                layers.Conv1D(filters=end_cnn_filters,
                              kernel_size=int(s),
                              activation=cnn_activation,
                              padding='causal',
                              kernel_initializer='lecun_normal',
                              name=layer_names.__next__())(x)
                for conv_i, s in enumerate(end_cnn_kernel_size)],
                axis=-1, name=layer_names.__next__(), )
            x = layers.BatchNormalization(name=layer_names.__next__(), )(x)
            x = layers.SpatialDropout1D(end_spatial_dropout)(x)

        outputs = {}
        loss = {}
        for col in seq.y_cols:
            if col in seq.categorical_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation='softmax'), name=col)(x)
                loss[col] = keras.losses.CategoricalCrossentropy(
                    label_smoothing=tf.cast(hp.Float('label_smoothing', 0.0, 0.6), 'float32'),
                )  # does not work well with mixed precision and stateful model
            if col in seq.regression_cols:
                shape = seq.shapes[col][-1]
                outputs[col] = layers.TimeDistributed(layers.Dense(shape, activation=None), name=col)(x)
                loss[col] = 'mse'

        if stateful or config.training.AVS_proxy_ratio == 0:
            if config.training.AVS_proxy_ratio == 0:
                logging.log(logging.WARNING, f'Not using AVSModel with superior optimizer due to '
                                             f'{config.training.AVS_proxy_ratio=}.')
            model = Model(inputs=inputs, outputs=outputs)
            opt = keras.optimizers.Adam()
        else:
            model = AVSModel(inputs=inputs, outputs=outputs, config=config)

            decay_start_epoch = hp.Int('decay_start_epoch', 15, 40)
            decay_end_epoch = (decay_start_epoch * 4) // 3
            lr_schedule = FlatCosAnnealSchedule(decay_start=len(seq) * decay_start_epoch,
                                                # Give extra epochs to big batch_size
                                                initial_learning_rate=hp.Choice('initial_learning_rate',
                                                                                [3e-2, 1e-2, 8e-3]),
                                                decay_steps=len(seq) * decay_end_epoch,
                                                alpha=0.001, )
            # Ranger hyper params based on https://github.com/fastai/imagenette/blob/master/2020-01-train.md
            opt = tfa.optimizers.RectifiedAdam(learning_rate=lr_schedule,
                                               beta_1=0.95,
                                               beta_2=0.99,
                                               epsilon=1e-6)
            opt = tfa.optimizers.Lookahead(opt, sync_period=6, slow_step_size=0.5)

        model.compile(
            optimizer=opt,
            loss=loss,
            metrics=metrics.create_metrics((not stateful), config),
        )

        return model
Example #15
    def build(self, hp: kerastuner.HyperParameters) -> keras.Model:
        """Build LSTM model

        Notes:
            This is normally called within a HyperModel context.
        Args:
            hp (:obj:`HyperParameters`): `HyperParameters` instance

        Returns:
            A built/compiled keras model ready for hyperparameter tuning
        """

        # L1/L2 vals
        reg_vals = [0.0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

        # Model Topology

        # Should we multiply the feature embeddings by their averages?
        weighting = hp.Boolean("Feature Weighting")

        # Should we add a dense layer between RNN and output?
        final_dense = hp.Boolean("Final Dense Layer")

        # Feature Embedding Params
        emb_l1 = hp.Choice("Feature Embedding L1", reg_vals)
        emb_l2 = hp.Choice("Feature Embedding L2", reg_vals)

        emb_n = hp.Int("Embedding Dimension",
                       min_value=64,
                       max_value=512,
                       default=64,
                       step=64)

        # Demog Embedding
        demog_emb_n = hp.Int("Demographics Embedding Dimension",
                             min_value=1,
                             max_value=64,
                             default=self.n_demog)

        # Average Embedding Params
        avg_l1 = hp.Choice("Average Embedding L1",
                           reg_vals,
                           parent_name="Feature Weighting",
                           parent_values=[True])
        avg_l2 = hp.Choice("Average Embedding L2",
                           reg_vals,
                           parent_name="Feature Weighting",
                           parent_values=[True])

        # LSTM Params
        lstm_n = hp.Int("LSTM Units",
                        min_value=32,
                        max_value=512,
                        default=32,
                        step=32)
        lstm_dropout = hp.Float("LSTM Dropout",
                                min_value=0.0,
                                max_value=0.9,
                                default=0.4,
                                step=0.01)
        lstm_recurrent_dropout = hp.Float("LSTM Recurrent Dropout",
                                          min_value=0.0,
                                          max_value=0.9,
                                          default=0.4,
                                          step=0.01)
        lstm_l1 = hp.Choice("LSTM weights L1", reg_vals)
        lstm_l2 = hp.Choice("LSTM weights L2", reg_vals)

        # Final dense layer
        dense_n = hp.Int("Dense Units",
                         min_value=2,
                         max_value=128,
                         sampling="log",
                         parent_name="Final Dense Layer",
                         parent_values=[True])
        # Model code
        feat_input = keras.Input(shape=(None, None), ragged=True)
        demog_input = keras.Input(shape=(self.n_demog_bags, ))

        demog_emb = keras.layers.Embedding(
            self.n_demog,
            output_dim=demog_emb_n,
            mask_zero=True,
            name="Demographic_Embeddings")(demog_input)

        demog_avg = keras.layers.Flatten()(demog_emb)

        emb1 = keras.layers.Embedding(
            self.vocab_size,
            output_dim=emb_n,
            embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
            mask_zero=True,
            name="Feature_Embeddings")(feat_input)

        if weighting:
            emb2 = keras.layers.Embedding(
                self.vocab_size,
                output_dim=1,
                embeddings_regularizer=keras.regularizers.l1_l2(
                    avg_l1, avg_l2),
                mask_zero=True,
                name="Average_Embeddings")(feat_input)

            # Multiplying the code embeddings by their respective weights
            mult = keras.layers.Multiply(name="Embeddings_by_Average")(
                [emb1, emb2])
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2),
                                      name="Averaging")(mult)
        else:
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2),
                                      name="Averaging")(emb1)

        lstm_layer = keras.layers.LSTM(
            lstm_n,
            dropout=lstm_dropout,
            recurrent_dropout=lstm_recurrent_dropout,
            recurrent_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
            name="Recurrent")(avg)

        lstm_layer = keras.layers.Concatenate()([lstm_layer, demog_avg])

        if final_dense:
            lstm_layer = keras.layers.Dense(dense_n,
                                            activation="relu",
                                            name="pre_output")(lstm_layer)

        activation_fn = "softmax" if self.n_classes > 2 else "sigmoid"
        output = keras.layers.Dense(
            self.n_classes if self.n_classes > 2 else 1,
            activation=activation_fn,
            name="Output")(lstm_layer)

        model = keras.Model([feat_input, demog_input], output)

        # --- Learning rate and momentum
        # lr = hp.Choice(
        #     "Learning Rate",
        #     [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])
        # momentum = hp.Float("Momentum", min_value=0.0, max_value=0.9, step=0.1)
        # opt = keras.optimizers.SGD(lr, momentum=momentum)
        opt = keras.optimizers.Adam()

        # --- Loss FN
        # NOTE: I was messing around with focal loss here, but I think that's
        # harder to justify and explain in this context
        if self.loss is None:
            if self.n_classes > 2:
                loss_fn = keras.losses.categorical_crossentropy
            else:
                loss_fn = keras.losses.binary_crossentropy
        else:
            loss_fn = self.loss

        model.compile(optimizer=opt, loss=loss_fn, metrics=self.metrics)

        return model
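The parent_name / parent_values arguments above make a hyperparameter conditional: it is only sampled when its parent takes one of the listed values, and while inactive it simply resolves to its default. A standalone sketch of that behaviour:

from keras_tuner import HyperParameters

hp = HyperParameters()
weighting = hp.Boolean("Feature Weighting")              # defaults to False
avg_l1 = hp.Choice("Average Embedding L1", [0.0, 1e-4, 1e-2],
                   parent_name="Feature Weighting",
                   parent_values=[True])
# The child is inactive because the parent is False, so its default (the
# first value, 0.0) is returned instead of a tuned choice.
print(weighting, avg_l1)  # -> False 0.0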
Example #16
    def build(self, hp: kerastuner.HyperParameters) -> keras.Model:
        """Build DAN model

        Notes:
            This is normally called within a HyperModel context.
        Args:
            hp (:obj:`HyperParameters`): `HyperParameters` instance

        Returns:
            A built/compiled keras model ready for hyperparameter tuning
        """

        # L1/L2 vals
        reg_vals = [0.0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

        # --- Model Topology

        # Feature Embedding Params
        emb_l1 = hp.Choice("Feature Embedding L1", reg_vals, default=0.0)
        emb_l2 = hp.Choice("Feature Embedding L2", reg_vals, default=0.0)

        emb_n = hp.Int("Embedding Dimension",
                       min_value=64,
                       max_value=2048,
                       default=1024,
                       step=64)

        emb_dropout = hp.Float("Dropout from Embeddings",
                               min_value=0.0,
                               max_value=0.9,
                               step=0.05,
                               default=0.0)

        final_dropout = hp.Float("Dropout before prediction",
                                 min_value=0.0,
                                 max_value=0.9,
                                 step=0.05,
                                 default=0.5)

        # Final dense layer
        dense_size = hp.Int("Dense Units",
                            min_value=2,
                            max_value=128,
                            sampling="log",
                            default=14)

        # --- Model
        feat_input = keras.Input(shape=(self.input_size, ))

        # Feature Embeddings
        embeddings = keras.layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=emb_n,
            embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
            mask_zero=True,
            name="Feature_Embeddings")(feat_input)

        dropout_1 = keras.layers.Dropout(rate=emb_dropout)(embeddings)

        # Averaging the embeddings
        embedding_avg = keras.backend.mean(dropout_1, 1)

        # Dense layers
        dense = keras.layers.Dense(dense_size,
                                   activation="relu",
                                   name='dense_1')(embedding_avg)

        dropout_2 = keras.layers.Dropout(final_dropout)(dense)

        activation_fn = "softmax" if self.n_classes > 2 else "sigmoid"

        output = keras.layers.Dense(
            units=self.n_classes if self.n_classes > 2 else 1,
            activation=activation_fn,
            name="Output")(dropout_2)

        model = keras.Model(feat_input, output)

        # --- Learning rate and momentum
        # lr = hp.Choice(
        #     "Learning Rate",
        #     [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])
        # momentum = hp.Float("Momentum", min_value=0.0, max_value=0.9, step=0.1)
        # opt = keras.optimizers.SGD(lr, momentum=momentum)
        # NOTE: I've had a lot of issues with SGD getting even comparable performance to Adam
        # so I'm saying we scrap it and just go with Adam.
        opt = keras.optimizers.Adam()

        # --- Loss FN
        # NOTE: I was messing around with focal loss here, but I think that's
        # harder to justify and explain in this context
        if self.loss is None:
            if self.n_classes > 2:
                loss_fn = keras.losses.categorical_crossentropy
            else:
                loss_fn = keras.losses.binary_crossentropy
        else:
            loss_fn = self.loss
        model.compile(optimizer=opt, loss=loss_fn, metrics=self.metrics)

        return model
Example #17
def optimize_model(model, kg1, kg2):

    bs = int(256)
    kg1 = pad(kg1, bs)
    kg2 = pad(kg2, bs)
    kg1 = np.asarray(kg1)
    kg2 = np.asarray(kg2)

    embeddings = {}

    model_name = model

    for kg, name in zip([kg1, kg2], ['_chemical', '_taxonomy']):

        N = len(set([s for s, _, _ in kg]) | set([o for _, _, o in kg]))
        M = len(set([p for _, p, _ in kg]))

        hp = HyperParameters()
        hp.Fixed('embedding_model', model_name)
        hp.Fixed('num_entities', value=N)
        hp.Fixed('num_relations', value=M)

        lfs = [
            'pairwize_hinge', 'pairwize_logistic', 'pointwize_hinge',
            'pointwize_logistic'
        ]

        hp.Int('margin', 1, 10, default=1)

        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])

        if model in ['ConvE', 'ConvR', 'ConvKB']:
            batch_size = 128
            hp.Fixed('hidden_dp', 0.2)
        else:
            batch_size = bs

        hp.Choice('regularization', [0.0, 0.01, 0.001], default=0.0)
        if model_name in ['TransE', 'HAKE', 'pRotatE', 'RotatE']:
            hp.Int('gamma', 0, 20, default=0)

        hp.Choice('loss_function', lfs)
        hp.Fixed('dp', 0.2)
        hp.Choice('dim', [100, 200, 400], default=200)
        hp.Choice('negative_samples', [10, 100], default=10)
        hp.Fixed('batch_size', batch_size)

        tuner = BayesianOptimization(
            build_model,
            hyperparameters=hp,
            objective=Objective('relative_loss', 'min'),
            max_trials=MAX_TRIALS,
            overwrite=True,
            project_name='tmp/' + ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(11)))

        tuner.search(kg,
                     epochs=SEARCH_MAX_EPOCHS,
                     batch_size=batch_size,
                     callbacks=[
                         ClearTrainingOutput(),
                         MyCallback(kg),
                         TerminateOnNaN(),
                         TimeStopping(SECONDS_PER_TRAIL),
                         EarlyStopping('loss', min_delta=1e-5, patience=3)
                     ],
                     verbose=1)

        tuner.results_summary()

        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        model = tuner.hypermodel.build(best_hps)
        out = dict()
        for k in best_hps.values.keys():
            out[k] = best_hps.values[k]
        with open('./pretrained_hp/%s%s_kg.json' % (model_name, name),
                  'w') as fp:
            json.dump(out, fp)

        model.fit(kg,
                  epochs=MAX_EPOCHS,
                  batch_size=batch_size,
                  callbacks=[
                      EarlyStopping('loss', min_delta=1e-5, patience=3),
                      ReduceLROnPlateau('loss', min_delta=1e-5, patience=3)
                  ])
        embeddings[name] = model.entity_embedding.get_weights()[0]

    return embeddings
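The JSON files written above (and read back in fit_sim_model through hp_file1 / hp_file2 / hp_pred_file) hold nothing more than the best trial's values; re-applying them as Fixed entries keeps a later search or final fit from re-tuning them. A sketch with an illustrative path:

import json
from keras_tuner import HyperParameters

# Re-apply a saved trial as fixed hyperparameters (path is illustrative).
hp = HyperParameters()
with open('./pretrained_hp/TransE_chemical_kg.json') as fp:
    for key, value in json.load(fp).items():
        hp.Fixed(key, value)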
Example #18
def build_hyper_l2_constrained(hp: HyperParameters,
                               n_tasks: int,
                               all_columns: List[str],
                               cat_features_dim: Dict[str, int],
                               restricted_hyperparameter_search: bool,
                               feature_sparsity_min: int = 4,
                               feature_sparsity_max: int = 9,
                               min_layers: int = 3,
                               max_layers: int = 6,
                               min_units_per_layer: int = 32,
                               max_units_per_layer: int = 64,
                               min_l2_alpha: float = 1e-1,
                               max_l2_alpha: float = 1e+2
                               ) -> Model:
    """
    Build model for L2 constrained multi-task learning model

    Parameters
    ----------
    hp: instance of HyperParameters
        Hyper-Parameters that define architecture and training of neural networks

    n_tasks: int
        Number of tasks

    all_columns: list
        Names of the features

    cat_features_dim: dict
        Dictionary that maps from the name of categorical feature
        to its dimensionality.

    restricted_hyperparameter_search: bool
        If True, then fixes the following hyperparameters and does not optimize them.
        - batch_size = 1024
        - hidden_layer_activation = relu
        - optimizer = sgd

    feature_sparsity_min: int
        Minimum possible value of feature sparsity threshold

    feature_sparsity_max: int
        Maximum possible value of feature sparsity threshold

    min_layers: int
        Minimum number of layers

    max_layers: int
        Maximum number of layers

    min_units_per_layer: int
        Minimum number of neurons per layer

    max_units_per_layer: int
        Maximum number of neurons per layer

    min_l2_alpha: float
        Minimum possible value of l2 regularization coefficient

    max_l2_alpha: float
        Maximum possible value of l2 regularization coefficient

    Returns
    -------
    model: tensorflow.keras.models.Model
        Compiled L2 Constrained Model
    """
    # define activation functions and preprocessing layer
    build_activation_functions(hp, restricted_hyperparameter_search)
    preprocessing_layer = build_preprocessing_layer_uci_income(hp,
                                                               all_columns,
                                                               cat_features_dim,
                                                               feature_sparsity_min,
                                                               feature_sparsity_max)
    # propagate input through preprocessing layer
    input_layer = Input(shape=(len(all_columns),))
    x = preprocessing_layer(input_layer)

    # build l2 constrained model
    n_layers = hp.Int("number_of_hidden_layers",
                      min_value=min_layers,
                      max_value=max_layers)
    for i in range(n_layers):
        n_units = hp.Int("n_units_layer_{0}".format(i),
                         min_value=min_units_per_layer,
                         max_value=max_units_per_layer)
        mtl_layers = [Dense(n_units, hp['hidden_layer_activation']) for _ in range(n_tasks)]
        l2_regularizer = hp.Float("l2_regularizer_layer_{0}".format(i),
                                  min_value=min_l2_alpha,
                                  max_value=max_l2_alpha)
        constrained_l2 = ConstrainedMTL(mtl_layers, l1_regularizer=0., l2_regualrizer=l2_regularizer)
        x = constrained_l2(x)
    output_layers = [Dense(1, hp['output_layer_activation'])(x[i]) for i in range(n_tasks)]
    model = Model(inputs=input_layer, outputs=output_layers)
    return model
Ejemplo n.º 19
0
def get_tunable_roberta(hp: HyperParameters):
    ids = keras.layers.Input(shape=(Config.Train.max_len, ),
                             dtype=tf.int32,
                             name='ids')
    att = keras.layers.Input(shape=(Config.Train.max_len, ),
                             dtype=tf.int32,
                             name='att')
    tok_type_ids = keras.layers.Input(shape=(Config.Train.max_len, ),
                                      dtype=tf.int32,
                                      name='tti')

    config = RobertaConfig.from_pretrained(Config.Roberta.config)
    roberta_model = TFRobertaModel.from_pretrained(Config.Roberta.model,
                                                   config=config)

    roberta_model.trainable = False

    x = roberta_model(ids, attention_mask=att, token_type_ids=tok_type_ids)

    use_alpha_dropout = False  # hp.Boolean('use_alpha_dropout')
    if use_alpha_dropout:
        x1 = keras.layers.AlphaDropout(hp.Choice('dropout1',
                                                 [0.1, 0.2, 0.3]))(x[0])
        x2 = keras.layers.AlphaDropout(hp.Choice('dropout2',
                                                 [0.1, 0.2, 0.3]))(x[0])
    else:
        x1 = keras.layers.Dropout(hp.Choice('dropout1', [0.1, 0.2, 0.3]))(x[0])
        x2 = keras.layers.Dropout(hp.Choice('dropout2', [0.1, 0.2, 0.3]))(x[0])

    use_rnn = False  # hp.Boolean('use_rnn')
    if use_rnn:
        lstm_count = hp.Choice('rnn_count', [1, 2])
        for i in range(lstm_count):
            x1, state1_0, _, state1_1, _ = keras.layers.Bidirectional(
                keras.layers.LSTM(hp.Int(f'lstm_units1_{i}', 32, 48, step=8),
                                  return_sequences=True,
                                  return_state=True))(x1)
            x1 = keras.layers.LeakyReLU()(x1)
            state1 = keras.layers.concatenate([state1_0, state1_1])
            x1 = keras.layers.Attention()([x1, state1])
            x2, state2_0, _, state2_1, _ = keras.layers.Bidirectional(
                keras.layers.LSTM(hp.Int(f'lstm_units2_{i}', 32, 48, step=8),
                                  return_sequences=True,
                                  return_state=True))(x2)
            x2 = keras.layers.LeakyReLU()(x2)
            state2 = keras.layers.concatenate([state2_0, state2_1])
            x2 = keras.layers.Attention()([x2, state2])
    else:
        conv_count = hp.Choice('conv_count', [1, 2])
        for i in range(conv_count):
            x1 = keras.layers.Conv1D(hp.Int(f'conv_filter1_{i}', 8, 24,
                                            step=8),
                                     hp.Int(f'conv_kernel1_{i}', 3, 5, step=1),
                                     padding='same')(x1)
            x1 = keras.layers.LeakyReLU()(x1)
            x2 = keras.layers.Conv1D(hp.Int(f'conv_filter2_{i}', 8, 24,
                                            step=8),
                                     hp.Int(f'conv_kernel2_{i}', 3, 5, step=1),
                                     padding='same')(x2)
            x2 = keras.layers.LeakyReLU()(x2)

    x1 = keras.layers.Conv1D(1, 1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', name='sts')(x1)

    x2 = keras.layers.Conv1D(1, 1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids],
                               outputs=[x1, x2])
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    loss = keras.losses.CategoricalCrossentropy(
        label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)

    return model