def train_intent_model(int_ds, random_selection: bool, limit_num_sents: bool, num_samples=None):
    """Train a linear-kernel SVM intent classifier.

    :param int_ds: dict with 'train', 'test' and 'oos_test' sentence lists
    :param random_selection: if True, restrict train/test to a random subset of intent labels
    :param limit_num_sents: whether to cap the number of sentences per intent
    :param num_samples: how many intent labels to sample when random_selection is True
    :returns: (fitted SVC model, X_test, y_test, Split instance)
    """
    if random_selection:
        # Pick a random subset of intent labels and keep only their sentences.
        selection = get_intents_selection(int_ds['train'], num_samples)  # selected intent labels: (num_samples, ) np.ndarray
        dataset = copy.deepcopy(int_ds)  # deepcopy so the caller's dict is not modified
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)  # train filtered to the selected intents
        dataset['test'] = get_filtered_lst(int_ds['test'], selection)
    else:
        dataset = int_ds

    split = Split()
    # fit only on the first (training) dataset so the mapping is learned once
    X_int_train, y_int_train = split.get_X_y(dataset['train'], fit=True,
                                             limit_num_sents=limit_num_sents, set_type='train')
    X_int_test, y_int_test = split.get_X_y(dataset['test'] + dataset['oos_test'], fit=False,
                                           limit_num_sents=limit_num_sents, set_type='test')

    svc_int = svm.SVC(C=1, kernel='linear').fit(X_int_train, y_int_train)

    return svc_int, X_int_test, y_int_test, split
def train_intent_model(int_ds, random_selection: bool, dim: int, limit_num_sents: bool, num_samples=None):
    """Train a supervised fastText intent classifier.

    :param int_ds: dict with 'train', 'test' and 'oos_test' sentence lists
    :param random_selection: if True, restrict training data to a random subset of intent labels
    :param dim: embedding dimension; also selects the matching pretrained-vector file
    :param limit_num_sents: whether to cap the number of sentences per intent
    :param num_samples: how many intent labels to sample when random_selection is True
    :returns: (trained fastText model, X_test, y_test)
    """
    if random_selection:
        # Restrict the training split to a random subset of intent labels.
        selection = get_intents_selection(int_ds['train'], num_samples)  # selected intent labels: (num_samples, ) np.ndarray
        dataset = copy.deepcopy(int_ds)  # deepcopy so the caller's dict is not modified
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)  # train filtered to the selected intents
    else:
        dataset = int_ds

    train_str_int = dataset_2_string(dataset['train'], limit_num_sents=limit_num_sents, set_type='train')
    X_int_test, y_int_test = get_X_y_fasttext(dataset['test'] + dataset['oos_test'],
                                              limit_num_sents=limit_num_sents, set_type='test')

    # fastText trains from a file path, so spill the formatted training data to a temp file.
    with NamedTemporaryFile() as f:
        f.write(train_str_int.encode('utf8'))
        f.seek(0)
        # Train model for in-scope queries
        model_int = fasttext.train_supervised(
            input=f.name,
            dim=dim,
            pretrainedVectors=f'{PRETRAINED_VECTORS_PATH}/cc.en.{dim}.vec')

    return model_int, X_int_test, y_int_test
def train_intent_model(int_ds, random_selection: bool, limit_num_sents: bool, num_samples=None):
    """Train a Rasa NLU intent model.

    :param int_ds: dict with 'train', 'test' and 'oos_test' sentence lists
    :param random_selection: if True, restrict training data to a random subset of intent labels
    :param limit_num_sents: whether to cap the number of sentences per intent
    :param num_samples: how many intent labels to sample when random_selection is True
    :returns: (trained Rasa interpreter, X_test, y_test)
    """
    if random_selection:
        # Restrict the training split to a random subset of intent labels.
        selection = get_intents_selection(int_ds['train'], num_samples)  # selected intent labels: (num_samples, ) np.ndarray
        dataset = copy.deepcopy(int_ds)  # deepcopy so the caller's dict is not modified
        dataset['train'] = get_filtered_lst(int_ds['train'], selection)  # train filtered to the selected intents
    else:
        dataset = int_ds

    train_str_int = dataset_2_string_rasa(dataset['train'], limit_num_sents=limit_num_sents, set_type='train')
    X_int_test, y_int_test = get_X_y_rasa(dataset['test'] + dataset['oos_test'],
                                          limit_num_sents=limit_num_sents, set_type='test')

    # Rasa loads training data from a file path; the .yml suffix selects the YAML reader.
    with NamedTemporaryFile(suffix='.yml') as f:
        f.write(train_str_int.encode('utf8'))
        f.seek(0)
        training_data = rasa.shared.nlu.training_data.loading.load_data(f.name)
        config = rasa.nlu.config.load('config.yml')
        trainer = rasa.nlu.model.Trainer(config)
        model_int = trainer.train(training_data)

    return model_int, X_int_test, y_int_test
def train_intent_model(int_ds, random_selection: bool, limit_num_sents: bool, num_samples=None):
    """Fine-tune a BERT sequence classifier for intent detection.

    :param int_ds: dict with 'train', 'val', 'test' and 'oos_test' sentence lists
    :param random_selection: if True, restrict train/test to a random subset of intent labels
    :param limit_num_sents: whether to cap the number of sentences per intent
    :param num_samples: how many intent labels to sample when random_selection is True
    :returns: (trained model, X_test, y_test, Split_BERT instance)
    """
    if random_selection:
        selection = get_intents_selection(int_ds['train'], num_samples)  # selected intent labels: (num_samples, ) np.ndarray
        filt_train = get_filtered_lst(int_ds['train'], selection)  # train filtered to the selected intents
        filt_test = get_filtered_lst(int_ds['test'], selection)
        mod_int_ds = copy.deepcopy(int_ds)  # deepcopy in order to not modify the original dict
        mod_int_ds['train'] = filt_train
        mod_int_ds['test'] = filt_test
        dataset = mod_int_ds
    else:
        dataset = int_ds

    # Split and tokenize dataset
    split = Split_BERT()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    X_int_train, y_int_train = split.get_X_y(dataset['train'], limit_num_sents=limit_num_sents, set_type='train')
    X_int_val, y_int_val = split.get_X_y(dataset['val'], limit_num_sents=limit_num_sents, set_type='val')
    X_int_test, y_int_test = split.get_X_y(dataset['test'] + dataset['oos_test'],
                                           limit_num_sents=limit_num_sents, set_type='test')

    train_int_ids, train_int_attention_masks, train_int_labels = tokenize_BERT(
        X_int_train, y_int_train, tokenizer)
    val_int_ids, val_int_attention_masks, val_int_labels = tokenize_BERT(
        X_int_val, y_int_val, tokenizer)

    # minus 1 because 'oos' label isn't used in training
    num_labels = len(split.intents_dct.keys()) - 1

    # Train model — adjust the classification head to the number of in-scope labels.
    model_int = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=num_labels)

    # FIX: Model.summary() prints the summary itself and returns None, so the
    # original print('\nBert Model', model_int.summary()) emitted "Bert Model None".
    print('\nBert Model')
    model_int.summary()

    log_dir = 'tensorboard_data/tb_bert'
    model_save_path = './models/bert_model.h5'

    callbacks = [
        # Keep only the best weights (lowest validation loss).
        tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                           save_weights_only=True,
                                           monitor='val_loss',
                                           mode='min',
                                           save_best_only=True),
        tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    ]

    # from_logits=True because the model head outputs raw logits, not softmax.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)

    model_int.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    history = model_int.fit(
        [train_int_ids, train_int_attention_masks],
        train_int_labels,
        batch_size=32,
        epochs=5,
        validation_data=([val_int_ids, val_int_attention_masks], val_int_labels),
        callbacks=callbacks)

    return model_int, X_int_test, y_int_test, split
results_dct = evaluate(int_ds, LIMIT_NUM_SENTS) print_results(dataset_size, results_dct) else: for num_samples in [3, 6, 9, 12]: # choose only a certain number of samples print( f'{repetitions} times random selection {num_samples} intents' ) accuracy_lst, recall_lst = [], [] far_lst, frr_lst = [], [] for i in range(repetitions): selection = get_intents_selection( int_ds['train'], num_samples ) # selected intent labels: (num_samples, ) np.ndarray filt_train = get_filtered_lst( int_ds['train'], selection ) # almost the same as int_ds['train'] but filtered according to selection filt_test = get_filtered_lst(int_ds['test'], selection) mod_int_ds = copy.deepcopy( int_ds ) # deepcopy in order to not modify the original dict mod_int_ds['train'] = filt_train mod_int_ds['test'] = filt_test temp_res = evaluate(mod_int_ds, LIMIT_NUM_SENTS) # temporary results