Example no. 1
    def make_prediction(self, raw_text, nn_type):
        resultado = {}

        # Generate the embedding associated with the raw text
        embedding = preprocessing.get_embedding_from_sentence(raw_text, self.embeddings)

        # Get the requested classifier type
        clasificador_reacciones = self.clasificadores_reacciones[nn_type]

        # Get the REACTION prediction
        softmax_prediction_reaccion = np.reshape(clasificador_reacciones.predict(embedding), (4,))
        # softmax_prediction_reaccion = np.reshape(self.clf_reacciones.predict(embedding), (4,))
        onehot_prediction_reaccion = preprocessing.one_hot(softmax_prediction_reaccion)
        index_reaccion = int(np.argmax(onehot_prediction_reaccion))

        # Get the classifier associated with the predicted reaction
        clasificador_conductas = self.categorias[index_reaccion]['clasificador']

        # Get the CONDUCT prediction
        softmax_prediction_conducta = np.reshape(clasificador_conductas.predict(embedding), (3,))
        onehot_prediction_conducta = preprocessing.one_hot(softmax_prediction_conducta)
        index_conducta = int(np.argmax(onehot_prediction_conducta))

        reaccion = self.categorias[index_reaccion]['reaccion']

        resultado['categoria'] = reaccion
        resultado['conducta'] = self.categorias[index_reaccion]['conductas'][index_conducta]
        return resultado
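# Hedged usage sketch (not from the original source): assumes the enclosing class
# can be instantiated as `Predictor` and that 'mlp' keys a trained reaction
# classifier in self.clasificadores_reacciones; both names are illustrative.
predictor = Predictor()
resultado = predictor.make_prediction("texto de ejemplo", nn_type='mlp')
print(resultado['categoria'], resultado['conducta'])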
def main():

    print("** loading training data...")
    df = pd.read_csv(r'data/kddcup.data', names=attribute_names)
    df = preprocessing.one_hot(df)
    df = preprocessing.map2major5(df)
    print("** training data loaded and processed")

    y = df["attack_type"].values
    X = df[features].values

    dt = DecisionTreeClassifier(criterion='entropy',
                                splitter='random',
                                max_depth=15,
                                min_samples_leaf=6)
    dt = dt.fit(X, y)

    x_rf = dt.predict(X)

    print("** training set accuracy (PCC) --> ",
          round(accuracy_score(y, x_rf) * 100, 2), "%")

    print("** loading testing data...")
    df = pd.read_csv(r'data/kddcup.data.corrected',
                     header=None,
                     names=attribute_names)
    df = preprocessing.one_hot(df)
    df = preprocessing.map2major5(df)
    print("** testing data loaded and processed")

    X = df[features].values
    y = df['attack_type'].values
    y_rf = dt.predict(X)

    cm = confusion_matrix(y, y_rf)

    arr = [[0 for _ in range(5)] for _ in range(5)]

    for v, c in df['attack_type'].value_counts().items():
        for i in range(len(cm[v])):
            a = round(cm[v][i] / c * 100, 2)
            ab = str(a) + '%'
            arr[v][i] = ab

    print("** confusion matrix:")
    for s in arr:
        print(*s)

    print("** testing set accuracy (PCC) --> ",
          round(accuracy_score(y, y_rf) * 100, 2), "%")
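# Hedged sketch (not the project's actual helper): preprocessing.map2major5 presumably
# collapses the raw KDD Cup '99 attack labels into the five major classes
# (normal, DoS, probe, R2L, U2R), which is why the confusion matrix above is 5x5.
# The column name and the (incomplete) mapping below are illustrative assumptions.
def map2major5_sketch(df):
    major_map = {
        'normal.': 0,                      # normal traffic
        'smurf.': 1, 'neptune.': 1,        # DoS
        'ipsweep.': 2, 'portsweep.': 2,    # probe
        'guess_passwd.': 3,                # R2L
        'rootkit.': 4,                     # U2R
    }
    df['attack_type'] = df['attack_type'].map(major_map)
    return df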
Example no. 3
def data_genertor():
    args = get_arguments()
    frame_len = args.frame_len
    frame_step = args.frame_step
    while True:

        for fullpath in glob.iglob(args.speech_file):
            fs_signal, signal_sound_data = manipulate.wavread(fullpath)
            signal_sound = AudioSegment.from_file(fullpath)

            for fullpath_noise in glob.iglob(args.noise_file):
                fs_noise, noise_sound_data = manipulate.wavread(fullpath_noise)
                noise_sound = AudioSegment.from_file(fullpath_noise)

                SNR = np.random.randint(args.min_snr, args.max_snr)
                dB = signal_sound.dBFS - noise_sound.dBFS - SNR
                noise_sound += dB  # apply gain so the noise sits SNR dB below the speech level
                noise_sound_data = noise_sound.get_array_of_samples()

                rand_start = np.random.randint(
                    len(noise_sound_data) - len(signal_sound_data))
                # check the lengths of signal and noise; assumes len(noise) > len(signal)

                combined = signal_sound_data + noise_sound_data[
                    rand_start:rand_start + len(signal_sound_data)]
                noisy_data = combined.astype(np.int16)

                # normalize both signals to [0, 1]
                noisy_data_norm = manipulate.normalize(noisy_data)
                signal_sound_data_norm = manipulate.normalize(
                    signal_sound_data)

                framed_noisy = frame.framesig(noisy_data_norm, frame_len,
                                              frame_step)
                framed_clean = frame.framesig(signal_sound_data_norm,
                                              frame_len, frame_step)

                #in_out =np.hstack((framed_noisy, framed_clean))
                #np.random.shuffle(in_out)
                #X_train = in_out[:,:frame_len]
                #audio = in_out[:,frame_len + frame_len/2]
                X_train = framed_noisy
                audio = framed_clean[:, frame_len // 2]  # integer index into each frame

                ulaw_audio = frame.ulaw(audio)
                digit_audio = frame.float_to_uint8(ulaw_audio)
                Y_train = frame.one_hot(digit_audio)

                yield X_train, Y_train  # yield
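# Hedged usage sketch (not from the original source): the generator yields
# (framed noisy audio, one-hot mu-law targets) pairs indefinitely, so it can feed
# a Keras-style training loop. The commented fit call and its step counts are
# illustrative assumptions, not part of the original script.
gen = data_genertor()
X_batch, Y_batch = next(gen)            # pull one batch to inspect the shapes
print(X_batch.shape, Y_batch.shape)
# model.fit_generator(gen, steps_per_epoch=100, epochs=10)  # assumes a compiled Keras model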
import sys
sys.path.append('..')
import numpy as np
import minst as Minst
import newtowlayernet as Newtwolayernet
import preprocessing as Preprocessing
import random

import matplotlib.pylab as plt

# Load the data
train_images = Preprocessing.normalize(Minst.get_train_images())
train_lables = Preprocessing.one_hot(Minst.get_train_lables())
test_images = Preprocessing.normalize(Minst.get_test_images())
test_lables = Preprocessing.one_hot(Minst.get_test_lables())

# Hyperparameters
iters_num = 1000
train_size = train_images.shape[0]
test_size = test_images.shape[0]
batch_size = 10
learning_rate = 0.002
train_loss_list = []
train_acc_list = []
test_acc_list = []
# Store gradients for monitoring
w1_list = []
b1_list = []
w2_list = []
b2_list = []
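# Hedged sketch (not from the original source) of the mini-batch training loop that
# the hyperparameters and monitoring lists above suggest. The two-layer-net API
# (constructor arguments, gradient, loss, params) is assumed from common reference
# implementations and may not match newtowlayernet exactly.
network = Newtwolayernet.TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)   # sample a mini-batch
    x_batch = train_images[batch_mask]
    t_batch = train_lables[batch_mask]

    grad = network.gradient(x_batch, t_batch)                # backprop gradients
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]     # SGD update

    train_loss_list.append(network.loss(x_batch, t_batch))
    w1_list.append(np.linalg.norm(grad['W1']))               # monitor gradient magnitude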
'''
def run():
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        #===================TRAINING BRANCH=======================
        #Load the files into one input queue
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        input_queue = tf.train.slice_input_producer(
            [images,
             annotations])  #Slice_input producer shuffles the data by default.

        #Decode the image and annotation raw content
        filename = input_queue[0]
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        #preprocess the images and annotations
        preprocessed_image, preprocessed_annotation = preprocess_ori(
            image, annotation, image_height, image_width)

        #Batch up the images and annotations
        images, annotations = tf.train.batch(
            [preprocessed_image, preprocessed_annotation],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        #Create the model inference
        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits, probabilities = ENet(images,
                                         num_classes,
                                         batch_size=batch_size,
                                         is_training=True,
                                         reuse=None,
                                         num_initial_blocks=num_initial_blocks,
                                         stage_two_repeat=stage_two_repeat,
                                         skip_connections=skip_connections)

        #perform one-hot encoding on the ground truth annotation to get the same shape as the logits
        annotations = tf.reshape(annotations,
                                 shape=[batch_size, image_height, image_width])
        annotations_ohe = one_hot(annotations, batch_size, dataset)

        #Actually compute the loss
        loss = weighted_cross_entropy(logits=logits,
                                      onehot_labels=annotations_ohe,
                                      class_weights=class_weights)
        total_loss = tf.losses.get_total_loss()

        #Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        #Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(learning_rate=initial_learning_rate,
                                        global_step=global_step,
                                        decay_steps=decay_steps,
                                        decay_rate=learning_rate_decay_factor,
                                        staircase=True)

        #Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=epsilon)

        #Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        #State the metrics that you want to compute. The predictions are not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        metrics_op = tf.group(accuracy_update, mean_IOU_update)

        #Now we need to create a training step function that runs the train_op and metrics_op and updates the global_step concurrently.
        def train_step(sess, train_op, global_step, metrics_op):
            '''
            Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
            '''
            #Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, accuracy_val, mean_IOU_val, _ = sess.run(
                [train_op, global_step, accuracy, mean_IOU, metrics_op])
            time_elapsed = time.time() - start_time

            #Run the logging to show some results
            logging.info(
                'global step %s: loss: %.4f (%.2f sec/step)(%.2f fps)    Current Streaming Accuracy: %.4f    Current Mean IOU: %.4f',
                global_step_count, total_loss, time_elapsed / batch_size,
                batch_size / time_elapsed, accuracy_val, mean_IOU_val)

            return total_loss, accuracy_val, mean_IOU_val

        #================VALIDATION BRANCH========================
        #Load the files into one input queue
        images_val = tf.convert_to_tensor(image_val_files)
        annotations_val = tf.convert_to_tensor(annotation_val_files)
        input_queue_val = tf.train.slice_input_producer(
            [images_val, annotations_val])

        #Decode the image and annotation raw content
        filename_val = input_queue_val[0]
        image_val = tf.read_file(input_queue_val[0])
        image_val = tf.image.decode_jpeg(image_val, channels=3)
        annotation_val = tf.read_file(input_queue_val[1])
        annotation_val = tf.image.decode_png(annotation_val)

        #preprocess the images and annotations
        preprocessed_image_val, preprocessed_annotation_val = preprocess_ori(
            image_val, annotation_val, image_height, image_width)

        #Batch up the image and annotation
        images_val, annotations_val, filenames_val = tf.train.batch(
            [
                preprocessed_image_val, preprocessed_annotation_val,
                filename_val
            ],
            batch_size=eval_batch_size,
            allow_smaller_final_batch=True)

        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits_val, probabilities_val = ENet(
                images_val,
                num_classes,
                batch_size=eval_batch_size,
                is_training=True,
                reuse=True,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        #perform one-hot encoding on the ground truth annotation to get the same shape as the logits
        annotations_val = tf.reshape(
            annotations_val,
            shape=[eval_batch_size, image_height, image_width])
        annotations_ohe_val = one_hot(annotations_val, batch_size, dataset)

        #State the metrics that you want to compute. The predictions are not one-hot encoded. ----> Should we use OHE instead?
        predictions_val = tf.argmax(probabilities_val, -1)
        accuracy_val, accuracy_val_update = tf.contrib.metrics.streaming_accuracy(
            predictions_val, annotations_val)
        mean_IOU_val, mean_IOU_val_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions_val,
            labels=annotations_val,
            num_classes=num_classes)
        metrics_op_val = tf.group(accuracy_val_update, mean_IOU_val_update)

        #Create an output for showing the segmentation output of validation images
        segmentation_output_val = tf.cast(predictions_val, dtype=tf.float32)
        segmentation_output_val = tf.reshape(
            segmentation_output_val, shape=[-1, image_height, image_width, 1])
        segmentation_ground_truth_val = tf.cast(annotations_val,
                                                dtype=tf.float32)
        segmentation_ground_truth_val = tf.reshape(
            segmentation_ground_truth_val,
            shape=[-1, image_height, image_width, 1])

        def eval_step(sess, metrics_op):
            '''
            Simply takes in a session, runs the metrics op and some logging information.
            '''
            start_time = time.time()
            _, accuracy_value, mean_IOU_value = sess.run(
                [metrics_op, accuracy_val, mean_IOU_val])
            time_elapsed = time.time() - start_time

            #Log some information
            logging.info(
                '---VALIDATION--- Validation Accuracy: %.4f    Validation Mean IOU: %.4f    (%.2f sec/step)(%.2f fps)',
                accuracy_value, mean_IOU_value, time_elapsed / eval_batch_size,
                eval_batch_size / time_elapsed)

            return accuracy_value, mean_IOU_value

        #=====================================================

        #Now finally create all the summaries you need to monitor and group them into one summary op.
        tf.summary.scalar('Monitor/Total_Loss', total_loss)
        tf.summary.scalar('Monitor/validation_accuracy', accuracy_val)
        tf.summary.scalar('Monitor/training_accuracy', accuracy)
        tf.summary.scalar('Monitor/validation_mean_IOU', mean_IOU_val)
        tf.summary.scalar('Monitor/training_mean_IOU', mean_IOU)
        tf.summary.scalar('Monitor/learning_rate', lr)
        tf.summary.image('Images/Validation_original_image',
                         images_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_output',
                         segmentation_output_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_ground_truth',
                         segmentation_ground_truth_val,
                         max_outputs=1)
        my_summary_op = tf.summary.merge_all()

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=logdir, summary_op=None, init_fn=None)

        # Run the managed session
        with sv.managed_session() as sess:

            for step in xrange(int(num_steps_per_epoch * num_epochs)):
                #At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s',
                                 step / num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value = sess.run([lr])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)

                #Log the summaries every 10 steps or at every end of epoch, whichever is lower.
                if step % min(num_steps_per_epoch, 10) == 0:
                    loss, training_accuracy, training_mean_IOU = train_step(
                        sess, train_op, sv.global_step, metrics_op=metrics_op)

                    #Check the validation data only at every third of an epoch
                    num_to_val = num_steps_per_epoch / 3
                    if step % num_to_val == 0:
                        for i in xrange(
                                len(image_val_files) / eval_batch_size):
                            validation_accuracy, validation_mean_IOU = eval_step(
                                sess, metrics_op_val)

                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #If not, simply run the training step
                else:
                    loss, training_accuracy, training_mean_IOU = train_step(
                        sess, train_op, sv.global_step, metrics_op=metrics_op)

            #We log the final training loss
            logging.info('Final Loss: %s', loss)
            logging.info('Final Training Accuracy: %s', training_accuracy)
            logging.info('Final Training Mean IOU: %s', training_mean_IOU)
            logging.info('Final Validation Accuracy: %s', validation_accuracy)
            logging.info('Final Validation Mean IOU: %s', validation_mean_IOU)

            #Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                #Plot the predictions - check validation images only
                logging.info('Total Steps: %d',
                             len(image_val_files) / eval_batch_size)
                logging.info('Saving the images now...')
                for step in xrange(len(image_val_files) / eval_batch_size):
                    start_time = time.time()
                    predictions_value, filenames_value = sess.run(
                        [predictions_val, filenames_val])
                    time_elapsed = time.time() - start_time
                    logging.info('step %d  %.2f(sec/step)  %.2f (fps)', step,
                                 time_elapsed / eval_batch_size,
                                 eval_batch_size / time_elapsed)

                    for i in xrange(eval_batch_size):
                        segmentation = produce_color_segmentation(
                            predictions_value[i], image_height, image_width,
                            dataset)
                        filename = filenames_value[i].split('/')
                        filename = filename[len(filename) - 1]
                        filename = photo_dir + "/trainResult_" + filename
                        cv2.imwrite(filename, segmentation)

            print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
def run():
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        #===================TEST BRANCH=======================
        #Load the files into one input queue
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        input_queue = tf.train.slice_input_producer([images, annotations],
                                                    shuffle=False)

        #Decode the image and annotation raw content
        filename = input_queue[0]
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        #preprocess and batch up the image and annotation
        preprocessed_image, preprocessed_annotation = preprocess_ori(
            image, annotation, image_height, image_width)
        images, annotations, filenames = tf.train.batch(
            [preprocessed_image, preprocessed_annotation, filename],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        #Create the model inference
        with slim.arg_scope(ENet_arg_scope()):
            logits, probabilities = ENet(images,
                                         num_classes,
                                         batch_size=batch_size,
                                         is_training=True,
                                         reuse=None,
                                         num_initial_blocks=num_initial_blocks,
                                         stage_two_repeat=stage_two_repeat,
                                         skip_connections=skip_connections)

        # Set up the variables to restore and restoring function from a saver.
        exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #perform one-hot encoding on the ground truth annotation to get the same shape as the logits
        annotations = tf.reshape(annotations,
                                 shape=[batch_size, image_height, image_width])
        annotations_ohe = one_hot(annotations, batch_size, dataset)

        #State the metrics that you want to compute. The predictions are not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        per_class_accuracy, per_class_accuracy_update = tf.metrics.mean_per_class_accuracy(
            labels=annotations,
            predictions=predictions,
            num_classes=num_classes)
        metrics_op = tf.group(accuracy_update, mean_IOU_update,
                              per_class_accuracy_update)

        #Create the global step and an increment op for monitoring
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(
            global_step, global_step + 1
        )  #no apply_gradient method so manually increasing the global_step

        #Create a evaluation step function
        def eval_step(sess, metrics_op, global_step):
            '''
            Simply takes in a session, runs the metrics op and some logging information.
            '''
            _, global_step_count, accuracy_value, mean_IOU_value, per_class_accuracy_value = sess.run(
                [
                    metrics_op, global_step_op, accuracy, mean_IOU,
                    per_class_accuracy
                ])

            start_time = time.time()
            predictions_val, filename_val = sess.run([predictions, filenames])
            time_elapsed = time.time() - start_time

            #Log some information
            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f     Streaming Mean IOU: %.4f     Per-class Accuracy: %.4f    %.2f(sec/step)  %.2f (fps)',
                global_step_count, accuracy_value, mean_IOU_value,
                per_class_accuracy_value, time_elapsed / batch_size,
                batch_size / time_elapsed)

            #Save the images
            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                #Segmentation
                for i in xrange(batch_size):
                    segmentation = produce_color_segmentation(
                        predictions_val[i], image_height, image_width, dataset)
                    filename = filename_val[i].split('/')
                    filename = filename[len(filename) - 1]
                    filename = photo_dir + "/trainResult_" + filename
                    cv2.imwrite(filename, segmentation)

            return accuracy_value, mean_IOU_value, per_class_accuracy_value, time_elapsed

        #Create your summaries
        tf.summary.scalar('Monitor/test_accuracy', accuracy)
        tf.summary.scalar('Monitor/test_mean_per_class_accuracy',
                          per_class_accuracy)
        tf.summary.scalar('Monitor/test_mean_IOU', mean_IOU)
        my_summary_op = tf.summary.merge_all()

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=logdir,
                                 summary_op=None,
                                 init_fn=restore_fn)

        #Run the managed session
        with sv.managed_session() as sess:

            total_time = 0
            for step in range(int(num_steps_per_epoch)):
                #Compute summaries every 10 steps and continue evaluating
                if step % 10 == 0:
                    test_accuracy, test_mean_IOU, test_per_class_accuracy, time_elapsed = eval_step(
                        sess,
                        metrics_op=metrics_op,
                        global_step=sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #Otherwise just run as per normal
                else:
                    test_accuracy, test_mean_IOU, test_per_class_accuracy, time_elapsed = eval_step(
                        sess,
                        metrics_op=metrics_op,
                        global_step=sv.global_step)

                total_time = total_time + time_elapsed

            #At the end of all the evaluation, show the final accuracy
            logging.info('Final Streaming Accuracy: %.4f', test_accuracy)
            logging.info('Final Mean IOU: %.4f', test_mean_IOU)
            logging.info('Final Per Class Accuracy: %.4f',
                         test_per_class_accuracy)
            logging.info('Average Speed: %.4f fps',
                         batch_size * (num_steps_per_epoch - 1) / total_time)

            #Show end of evaluation
            logging.info('Finished evaluating!')
Example no. 7
from model import CharCnn
from config import FLAGS
import preprocessing

if __name__ == '__main__':

    x_train, y_train, x_test, y_test = preprocessing.load_data(
        FLAGS.train_dir, FLAGS.test_dir)

    x_train_idx = preprocessing.convert_str2idx(x_train)
    x_test_idx = preprocessing.convert_str2idx(x_test)

    y_train = preprocessing.one_hot(y_train)
    y_test = preprocessing.one_hot(y_test)

    char_cnn = CharCnn(sequence_length=300,
                       num_char=70,
                       batch_size=128,
                       iteration=50,
                       init_lr=0.001,
                       n_class=6,
                       embedding_size=100,
                       num_filter=128,
                       filter_size=(7, 7, 3, 3, 3, 3),
                       hidden_unit=1024,
                       step_size=2000,
                       decay=0.9)
    char_cnn.train(x_train_idx, y_train)
    char_cnn.test(x_test_idx, y_test)
Example no. 8
def run_models(words,
               models,
               verbose,
               train=True,
               test=True,
               embeddings=False):
    '''
    Runs all of the specified models on the specified word set.
    It runs all preprocessing steps necessary for the models specified.
    Note: if a model is specified twice, it will be run twice, but the preprocessing
    of the input data will not be repeated (useful to test model parameter initialization).

    Returns a list containing the objects of the models used,
        the outputs they predicted and
        the sklearn classification reports (dictionary format),
        in the order in which they were provided.
    See the usage sketch after this function.

    Keyword arguments:
        words: list of lists of words and features.
            Format: n*m. n = number of words, m = number of features + expected output (single)
        models: a list containing the model names. Order is not important.
            Possible models are: NB, LR, SVM, HMM, CRF. Coming soon: CNN.
            If a model is specified twice, it will be run twice. The input is
            randomized only once, where applicable.
        verbose: 0: print nothing
                 1: print results
                 2: print status messages
                 3: print both
    '''
    # Prepare data for one-hot encoding -- converts strings into integers
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Initial pre-processing...')
        if embeddings:
            stems = [word[0] for word in words]
            words = [word[1:] for word in words]
        X, Y, transl, labels_num, labels_name = create_dataset(words)

    #Algorithm uses sentences (list of list of tuples): HMM
    if 'HMM' in models:
        verbose & 2 and print('Preprocessing data for HMM...')
        sentences_hmm, symbols, tag_set = words2tuples(words)
        _, y_train, _, y_test = split_tr([], sentences_hmm, 0.8)
        x_test = [[tup[0] for tup in sentence] for sentence in y_test]
        y_test = [[tup[1] for tup in sentence] for sentence in y_test]
        #shuffle_parallel(x_test,y_test)
        data_hmm = data_wrap(None, y_train, x_test, y_test)

    # Algorithms using shuffled, one-hot data:NB,LR,SVM
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Preprocessing data for NB, LR and/or SVM...')
        indexes = shuffle_parallel(X, Y)
        X_onehot_sh = one_hot(X, transl)
        if embeddings:
            verbose & 2 and print('Loading and generating embeddings...')
            X_onehot_sh = embeddings.insert_embeddings(X_onehot_sh, stems,
                                                       indexes)
        x_train_oh_sh, y_train_oh_sh, x_test_oh_sh, y_test_oh_sh = split_tr(
            X_onehot_sh, Y, 0.8)
        data_shuffled = data_wrap(x_train_oh_sh, y_train_oh_sh, x_test_oh_sh,
                                  y_test_oh_sh, transl, labels_num,
                                  labels_name)

    #Ordered, using sentences (list of list of dict): CRF
    if 'CRF' in models:
        verbose & 2 and print('Preprocessing data for CRF...')
        tokens_dict, labels_dict = words2dictionary(words)
        shuffle_parallel(tokens_dict, labels_dict)
        tokens_train, labels_train, tokens_test, labels_test = split_tr(
            tokens_dict, labels_dict, 0.8)
        data_dictionary = data_wrap(tokens_train, labels_train, tokens_test,
                                    labels_test)

    model_objects = []
    model_results = []
    model_predictions = []

    # Helper that removes clutter when collecting each model's output.
    # A list of function handles could also be used, but I find that less intuitive.
    def _add_to_output(model_y_pred):
        model_objects.append(model_y_pred[0])
        model_results.append(model_y_pred[1])
        if (len(model_y_pred) > 2):
            model_predictions.append(model_y_pred[2])

    # Run each of the models from the parameters, KEEPING THE ORDER they were called in,
    # and append the results to the return lists
    for model in models:
        if 'HMM' in model:
            verbose & 2 and print('Running HMM from nltk...')
            _add_to_output(HMM(data_hmm, symbols, tag_set, verbose & 1))

        if 'NB' in model:
            verbose & 2 and print('Running NB ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            if embeddings:
                _add_to_output(NB_cont(data_shuffled, verbose & 1))
            else:
                _add_to_output(NB_disc(data_shuffled, verbose & 1))

        if 'LR' in model:
            verbose & 2 and print('Running LR ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(
                LR(data_shuffled, verbose & 1, C=(0.1 if embeddings else 5)))

        if 'SVM' in model:
            verbose & 2 and print('Running SVM ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(SVM(data_shuffled, verbose & 1))

        if 'CRF' in model:
            verbose & 2 and print('Running CRF...')
            _add_to_output(CRF(data_dictionary, verbose & 1))

    return model_objects, model_results, model_predictions
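# Hedged usage sketch (not from the original source). Each row of `words` is
# [token features..., expected label] as the docstring describes; a real call
# would pass the full annotated corpus rather than the two illustrative rows here.
if __name__ == '__main__':
    example_words = [
        ['cardiomyopathy', 'NOUN', 'B-Disease'],
        ['severe', 'ADJ', 'O'],
    ]
    objects, reports, predictions = run_models(example_words,
                                               models=['NB', 'CRF'],
                                               verbose=3)
    for report in reports:
        print(report)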