def train_intent_model(int_ds, random_selection: bool, limit_num_sents: bool, num_samples=None):
    """Train a linear-kernel SVM intent classifier.

    :param int_ds: dict with 'train', 'test' and 'oos_test' splits
    :param random_selection: if True, restrict train/test to a random
        subset of intent labels
    :param limit_num_sents: forwarded to ``Split.get_X_y``
    :param num_samples: number of intent labels to sample when
        ``random_selection`` is True
    :returns: (fitted SVC, X test, y test, Split instance)
    """
    dataset = int_ds
    if random_selection:
        # Randomly chosen intent labels: (num_samples, ) np.ndarray.
        selection = get_intents_selection(int_ds['train'], num_samples)

        # Deep copy so the caller's dict is left unmodified.
        dataset = copy.deepcopy(int_ds)
        # Same content as the originals, restricted to the selected intents.
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)
        dataset['test'] = get_filtered_lst(int_ds['test'], selection)

    split = Split()

    # fit=True only for the first call (the train split).
    X_int_train, y_int_train = split.get_X_y(
        dataset['train'], fit=True, limit_num_sents=limit_num_sents, set_type='train')
    X_int_test, y_int_test = split.get_X_y(
        dataset['test'] + dataset['oos_test'], fit=False,
        limit_num_sents=limit_num_sents, set_type='test')

    svc_int = svm.SVC(C=1, kernel='linear').fit(X_int_train, y_int_train)

    return svc_int, X_int_test, y_int_test, split
def train_intent_model(int_ds,
                       random_selection: bool,
                       dim: int,
                       limit_num_sents: bool,
                       num_samples=None):
    """Train a fastText supervised intent classifier.

    :param int_ds: dict with 'train', 'test' and 'oos_test' splits
    :param random_selection: if True, train on a random subset of intent labels
    :param dim: embedding dimension; also selects the pretrained vector file
    :param limit_num_sents: forwarded to the dataset helpers
    :param num_samples: number of intent labels to sample when
        ``random_selection`` is True
    :returns: (trained fastText model, X test, y test)
    """
    dataset = int_ds
    if random_selection:
        # Randomly chosen intent labels: (num_samples, ) np.ndarray.
        selection = get_intents_selection(int_ds['train'], num_samples)

        # Deep copy so the caller's dict is left unmodified.
        dataset = copy.deepcopy(int_ds)
        # Same as int_ds['train'], restricted to the selected intents.
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)

    train_str_int = dataset_2_string(dataset['train'],
                                     limit_num_sents=limit_num_sents,
                                     set_type='train')
    X_int_test, y_int_test = get_X_y_fasttext(
        dataset['test'] + dataset['oos_test'],
        limit_num_sents=limit_num_sents,
        set_type='test')

    # fastText trains from a file path, so round-trip the training text
    # through a temporary file.
    with NamedTemporaryFile() as f:
        f.write(train_str_int.encode('utf8'))
        f.seek(0)

        # Train model for in-scope queries
        model_int = fasttext.train_supervised(
            input=f.name,
            dim=dim,
            pretrainedVectors=f'{PRETRAINED_VECTORS_PATH}/cc.en.{dim}.vec')

    return model_int, X_int_test, y_int_test
def train_intent_model(int_ds,
                       random_selection: bool,
                       limit_num_sents: bool,
                       num_samples=None):
    """Train a Rasa NLU intent model.

    :param int_ds: dict with 'train', 'test' and 'oos_test' splits
    :param random_selection: if True, train on a random subset of intent labels
    :param limit_num_sents: forwarded to the dataset helpers
    :param num_samples: number of intent labels to sample when
        ``random_selection`` is True
    :returns: (trained Rasa model, X test, y test)
    """
    dataset = int_ds
    if random_selection:
        # Randomly chosen intent labels: (num_samples, ) np.ndarray.
        selection = get_intents_selection(int_ds['train'], num_samples)

        # Deep copy so the caller's dict is left unmodified.
        dataset = copy.deepcopy(int_ds)
        # Same as int_ds['train'], restricted to the selected intents.
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)

    train_str_int = dataset_2_string_rasa(dataset['train'],
                                          limit_num_sents=limit_num_sents,
                                          set_type='train')
    X_int_test, y_int_test = get_X_y_rasa(dataset['test'] +
                                          dataset['oos_test'],
                                          limit_num_sents=limit_num_sents,
                                          set_type='test')

    # Rasa's loader reads from a file path, so round-trip the training data
    # through a temporary .yml file.
    with NamedTemporaryFile(suffix='.yml') as f:
        f.write(train_str_int.encode('utf8'))
        f.seek(0)
        training_data = rasa.shared.nlu.training_data.loading.load_data(f.name)

    config = rasa.nlu.config.load('config.yml')
    trainer = rasa.nlu.model.Trainer(config)
    model_int = trainer.train(training_data)

    return model_int, X_int_test, y_int_test
# Example #4
def train_intent_model(int_ds,
                       random_selection: bool,
                       limit_num_sents: bool,
                       num_samples=None):
    """Fine-tune a BERT sequence classifier for intent detection.

    :param int_ds: dict with 'train', 'val', 'test' and 'oos_test' splits
    :param random_selection: if True, restrict train/test to a random
        subset of intent labels
    :param limit_num_sents: forwarded to ``Split_BERT.get_X_y``
    :param num_samples: number of intent labels to sample when
        ``random_selection`` is True
    :returns: (trained model, X test sentences, y test labels, Split_BERT)
    """
    if random_selection:
        selection = get_intents_selection(
            int_ds['train'],
            num_samples)  # selected intent labels: (num_samples, ) np.ndarray

        filt_train = get_filtered_lst(
            int_ds['train'], selection
        )  # almost the same as int_ds['train'] but filtered according to selection
        filt_test = get_filtered_lst(int_ds['test'], selection)

        mod_int_ds = copy.deepcopy(
            int_ds)  # deepcopy in order to not modify the original dict
        mod_int_ds['train'] = filt_train
        mod_int_ds['test'] = filt_test

        dataset = mod_int_ds
    else:
        dataset = int_ds

    # Split and tokenize dataset
    split = Split_BERT()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    X_int_train, y_int_train = split.get_X_y(dataset['train'],
                                             limit_num_sents=limit_num_sents,
                                             set_type='train')
    X_int_val, y_int_val = split.get_X_y(dataset['val'],
                                         limit_num_sents=limit_num_sents,
                                         set_type='val')
    X_int_test, y_int_test = split.get_X_y(dataset['test'] +
                                           dataset['oos_test'],
                                           limit_num_sents=limit_num_sents,
                                           set_type='test')

    train_int_ids, train_int_attention_masks, train_int_labels = tokenize_BERT(
        X_int_train, y_int_train, tokenizer)
    val_int_ids, val_int_attention_masks, val_int_labels = tokenize_BERT(
        X_int_val, y_int_val, tokenizer)

    num_labels = len(split.intents_dct.keys(
    )) - 1  # minus 1 because 'oos' label isn't used in training

    # Train model
    model_int = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels)  # we have to adjust the number of labels
    print('\nBert Model', model_int.summary())

    log_dir = 'tensorboard_data/tb_bert'
    model_save_path = './models/bert_model.h5'

    callbacks = [
        # Keep only the best (lowest val_loss) weights on disk.
        tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                           save_weights_only=True,
                                           monitor='val_loss',
                                           mode='min',
                                           save_best_only=True),
        tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    ]

    # from_logits=True because the classification head returns raw logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)

    model_int.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    # fit() is called for its side effect; the History object was previously
    # bound to an unused local, so drop it.
    model_int.fit(
        [train_int_ids, train_int_attention_masks],
        train_int_labels,
        batch_size=32,
        epochs=5,
        validation_data=([val_int_ids,
                          val_int_attention_masks], val_int_labels),
        callbacks=callbacks)

    return model_int, X_int_test, y_int_test, split
# Example #5
            results_dct = evaluate(int_ds, LIMIT_NUM_SENTS)

            print_results(dataset_size, results_dct)
        else:
            for num_samples in [3, 6, 9,
                                12]:  # choose only a certain number of samples
                print(
                    f'{repetitions} times random selection {num_samples} intents'
                )

                accuracy_lst, recall_lst = [], []
                far_lst, frr_lst = [], []

                for i in range(repetitions):
                    selection = get_intents_selection(
                        int_ds['train'], num_samples
                    )  # selected intent labels: (num_samples, ) np.ndarray

                    filt_train = get_filtered_lst(
                        int_ds['train'], selection
                    )  # almost the same as int_ds['train'] but filtered according to selection
                    filt_test = get_filtered_lst(int_ds['test'], selection)

                    mod_int_ds = copy.deepcopy(
                        int_ds
                    )  # deepcopy in order to not modify the original dict
                    mod_int_ds['train'] = filt_train
                    mod_int_ds['test'] = filt_test

                    temp_res = evaluate(mod_int_ds,
                                        LIMIT_NUM_SENTS)  # temporary results