Example #1
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]
    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
Example #2
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
def sampleGenerator(midis,
                    batch_size,
                    fs=50,
                    shuffle_piece=False,
                    train=True,
                    midi_batch=5):
    """Generate batches of (X, y) samples from a list of MIDI pieces."""
    # Loop forever so the generator can be reused across epochs.
    while True:
        # Generate order of exploration of dataset

        # Generate batches
        #random.shuffle(midis,random.random)
        for i in range(int(len(midis) / midi_batch)):
            # Find list of IDs
            #midis_temp = midis[i*batch_size:(i+1)*batch_size]
            # Generate data

            X, y = create_dataset(midis[i * midi_batch:(i + 1) * midi_batch],
                                  fs=fs,
                                  poly=False)
            #X_train, X_test, y_train, y_test = train_test_split(X, y,
            #                                                   test_size=0.2,
            #                                                  random_state=7)
            X, y = randomize_set(X, y, seed=5)
            if train:
                X = X[:int(0.8 * len(X))]
                y = y[:int(0.8 * len(y))]
                X, y = randomize_set(X, y)
                imax = int(X.shape[0] / batch_size) - 1
                indexes = np.arange(imax)
                for j in indexes:
                    yield X[j * batch_size:(j + 1) *
                            batch_size, :, :], np.squeeze(
                                y[j * batch_size:(j + 1) * batch_size, :, :])
            else:
                X = X[int(0.8 * len(X)):]
                y = y[int(0.8 * len(y)):]
                imax = int(X.shape[0] / batch_size) - 1
                indexes = np.arange(imax)
                if imax == 0:
                    yield X, np.squeeze(y)
                else:
                    for j in indexes:
                        yield X[j * batch_size:(j + 1) *
                                batch_size, :, :], np.squeeze(
                                    y[j * batch_size:(j + 1) *
                                      batch_size, :, :])
                #yield X, np.squeeze(y)
            if shuffle_piece:
                random.shuffle(indexes)
def main():
    # Set hyper-parameters.
    emb_dim = 50
    epochs = 2
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # Load the corpus.
    text = load_data(filepath='../chap04/data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocablary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x,
              y=y,
              batch_size=128,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab)
    pprint(api.most_similar(word='日本'))
def train_from_batch(path_to_dir,
                     num_in_batch,
                     model,
                     fs=50,
                     midi_num=None,
                     data_type='roll',
                     poly=False):
    if midi_num:
        midi_files_num = len(os.listdir(path_to_dir)[:midi_num])
    else:
        midi_files_num = len(os.listdir(path_to_dir))
    file_list = os.listdir(path_to_dir)
    random.shuffle(file_list)

    num_files_in_batch = num_in_batch
    histories = []
    start = time.time()
    for i in range(0, int(midi_files_num / num_files_in_batch) - 1):
        if data_type == 'roll':
            midis, first, last = parse_directory(
                path_to_dir,
                file_list[i * num_files_in_batch:i * num_files_in_batch +
                          num_files_in_batch])
            X, y = create_dataset(midis, fs=fs, poly=poly)
        else:
            events, encoded, X, y = parse_directory_for_events(
                path_to_dir, fs,
                file_list[i * num_files_in_batch:i * num_files_in_batch +
                          num_files_in_batch])
        print(time.time() - start)  # seconds elapsed so far
        history = model.fit(X,
                            np.squeeze(y),
                            epochs=5,
                            batch_size=128,
                            validation_split=0.2)
        histories.append(history.history)
    X = None
    y = None
    return model, history, combine_history(histories)
        #        for i in range(5):
        #            model,history = train_from_batch(path_to_directory,num_in_batch, model)
        #            histories.append(history)
        end = time.time() - start
        print(end)
        #history = combine_history(histories)

        model.save(model_path + ".h5")
        plot_model(model, to_file=model_path + '.png')
        vis(history, model_path)
    else:
        path_to_directory = r"C:\Users\Maciek\Downloads\inputs"
        #path_to_directory=r"C:\Users\user\Desktop\Sound_generator\test"
        file_list = os.listdir(path_to_directory)[:1]
        midis, first, last = parse_directory(path_to_directory, file_list)
        X1, y1 = create_dataset(midis, fs)
        X, y = create_dataset_channels(midis, fs)
        folded = fold(X)
        midi_obj_from_roll = None
        for i, channel in enumerate(folded):
            monophonic_unsq = expand_roll(channel[1, :, :].T,
                                          delete_repress=repress)
            midi_obj_from_roll = piano_roll_to_midi_mono(
                monophonic_unsq.T, fs, midi=midi_obj_from_roll)
        midi_obj_from_roll.write(
            r'C:\Users\Maciek\Downloads\master-master\test_dur5.mid')
        #    print("Dataset generated")
        #    print(start - time.time())
        #    start_learning = time.time()
        #    #
        keras = False
    print('Sequences generated')
    return seeds, temperatures_high, temperatures_low, temperatures_mid


if __name__ == '__main__':
    model_path = r"C:\Users\user\Desktop\Sound_generator\models\test.h5"
    #model_path = r"C:\Users\Maciek\Downloads\master-master\master-master\lstm_repress_filtered.h5"
    seed_file = r"C:\Users\user\Desktop\Sound_generator\piano_midi\bach_846.mid"
    #seed_file = r"C:\Users\Maciek\Downloads\inputs\bach_846.mid"
    #out_path=r"C:\Users\Maciek\Downloads\master-master\master-master\{}.mid"
    out_path = r"C:\Users\user\Desktop\Sound_generator\midis\{}.mid"
    fs = 50
    seq_len = 1000
    midi_file = pretty_midi.PrettyMIDI(seed_file)
    midi_obj = MidiParser(midi_file)
    X, y = create_dataset(midi_obj, fs=fs)
    poly = False

    model = load_model(model_path)
    seeds, high, low, mid = generate(X, model, MAX_LEN, seq_len, poly, iters=5)
    all_notes = []
#    for i,melody in enumerate(mid):
#        try:
#            new_notes = note_events_to_midi(np.squeeze(np.array(melody).T),'gen_mid_events{}.mid'.format(i), fs=50)
#            all_notes.append(new_notes)
#        except:
#            print('Wrong midi created for {}'.format(i))
    for i, melody in enumerate(high):
        x = np.array(melody)
        x = expand_roll(np.squeeze(x).T, delete_repress=True).T
        seed = expand_roll(np.squeeze(seeds[i]).T, delete_repress=True).T
Example #8
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    #ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    #encoder = Encoder(num_words, return_sequences=True)
    #decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]
    """
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)"""
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    #api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません", "全く 納得 い き ません", "ありがとうございました", "おはようございます",\
            "よろしいでしょうか", "是非 よろしくお願いいたします", "もう少し 具体的に 教えて いただける と 助 か る んですけれども", "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",\
            "正 式 な 要求 でいい んですか", "時間ですので まとめて ください", "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",\
            "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね", "それは いつ ごろ でしょうか", "そのとおり です"
    ]
    for text in texts:
        decoded = api.predict(text=text)
        print('Input: {}'.format(text))
        print('Response: {}'.format(decoded))

    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Example #9
from preprocessing import create_dataset
from models import *
import torch
import utils

#create dataset
train_packs, train_picks, test_packs, test_picks = create_dataset()
#initialize model with 249 cards and 15 archetypes
rank_model = RankingNet(249, 15)
optimizer = torch.optim.Adam(rank_model.parameters(), lr=0.1)
#cross entropy loss function
# --> this works well for this problem because we are optimizing
#     for a pick out of a set of options that can be described in
loss_function = torch.nn.CrossEntropyLoss()
#only consider picks where the player has likely solidified their
#archetype (e.g., early in pack 2)
train_x = torch.flatten(train_packs[:, 16:, :], start_dim=0, end_dim=1)
train_y = torch.flatten(train_picks[:, 16:, :], start_dim=0, end_dim=1)
#train the model
utils.train(rank_model, loss_function, optimizer, train_x, train_y, epochs=5)
torch.save(rank_model, 'Saved_Models/rank_model.pkl')
#initialize drafting model with learned weights from rank model
init_weights = rank_model.ranking_matrix.detach()
#normalize the weights such that 1 is the largest initial weight
smaller_init_weights = init_weights / init_weights.max(0, keepdim=True)[0]
draft_model = DraftNet(smaller_init_weights)
optimizer = torch.optim.Adam(draft_model.parameters(), lr=0.1)
#flatten the drafts so that the algorithm only considers each pick
#individually and remove archetype label to avoid leakage
train_x = torch.flatten(train_packs, start_dim=0, end_dim=1)[:, 1:]
train_y = torch.flatten(train_picks, start_dim=0, end_dim=1)
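The example above stops after preparing the draft-model inputs but before the draft model is actually trained. Below is a minimal continuation sketch, assuming utils.train and torch.save behave exactly as in the rank-model calls earlier in the same script; draft_model, optimizer, train_x and train_y are taken from the lines above, and the output filename is a hypothetical name.

# Continuation sketch (hypothetical filename; reuses the training pattern shown above).
draft_loss_function = torch.nn.CrossEntropyLoss()
utils.train(draft_model, draft_loss_function, optimizer, train_x, train_y, epochs=5)
torch.save(draft_model, 'Saved_Models/draft_model.pkl')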
Example #10
from preprocessing import create_dataset
from models import *
import torch
import matplotlib.pyplot as plt
import utils
import sys

#flags for train-test-split and saving the models
#potential update: have these be command line params
full_flag = False
save = True
#name = '_simple'
#create dataset
train_packs, train_picks, test_packs, test_picks = create_dataset(
    full_dataset=full_flag, save_clusters=save)
#initialize model with 249 cards and 15 archetypes
rank_model = RankingNet(249, 15)
optimizer = torch.optim.Adam(rank_model.parameters(), lr=0.1)
#cross entropy loss function
# --> this works well for this problem because we are optimizing
#	 for a pick out of a set of options that can be described in
loss_function = torch.nn.CrossEntropyLoss()
#only consider picks where the player has likely solidified their
#archetype (e.g., early in pack 2)
train_x = torch.flatten(train_packs[:, 16:, :], start_dim=0, end_dim=1)
train_y = torch.flatten(train_picks[:, 16:, :], start_dim=0, end_dim=1)
#train the model
train_loss = utils.train(rank_model,
                         loss_function,
                         optimizer,
                         train_x,
Example #11
# Narrow down dataset to only selected country
country = ' '.join(parsed.country)
assert country in all_countries, f'\'{country}\' is not a valid choice'

dataset = full_dataset.loc[country].price.values

# Scale data to range of 0-1
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# Split into train and test sets
train, test = preprocessing.split_dataset(dataset, ratio=0.8)

# Create X and y sets
seq_len = parsed.seq_len
X_train, y_train = preprocessing.create_dataset(train, seq_len)
X_test, y_test = preprocessing.create_dataset(test, seq_len)

X_train = X_train.reshape((-1, seq_len, 1))
X_test = X_test.reshape((-1, seq_len, 1))

# Make training model
neurons = parsed.hidden_neurons
batch_sz = parsed.batch_size

training_model = Sequential()
training_model.add(
    LSTM(neurons, input_shape=(seq_len, 1), return_sequences=True))

training_model.add(LSTM(neurons, return_sequences=True))
training_model.add(Dense(1, activation='linear'))
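This example reshapes the output of preprocessing.create_dataset to (samples, seq_len, 1) before it reaches the LSTM, which implies a sliding-window layout. The preprocessing module itself is not shown on this page, so the sketch below uses a hypothetical make_windows helper purely to illustrate the assumed shapes; it is not the project's implementation.

import numpy as np

def make_windows(series, seq_len):
    # Hypothetical stand-in: inputs of length seq_len, next value as the target.
    X, y = [], []
    for i in range(len(series) - seq_len):
        X.append(series[i:i + seq_len])
        y.append(series[i + seq_len])
    return np.array(X), np.array(y)

# Toy usage: 100 scaled prices, windows of 30 time steps.
series = np.linspace(0.0, 1.0, 100)
X, y = make_windows(series, 30)
print(X.shape, y.shape)       # (70, 30) (70,)
X = X.reshape((-1, 30, 1))    # (samples, time steps, features), as in the example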
Example #12
    # Set hyper-parameters.
    emb_dim = 50
    epochs = 10
    model_path = 'model.h5'
    negative_samples = 1
    num_words = 10000
    window_size = 1

    # Load the corpus.
    text = load_data(filepath='data/ja.text8')

    # Build the vocabulary.
    vocab = build_vocabulary(text, num_words)

    # Create the dataset.
    x, y = create_dataset(text, vocab, num_words, window_size, negative_samples)

    # Build the model.
    model = EmbeddingModel(num_words, emb_dim)
    model = model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x,
              y=y,
def main(argv=None):
    trainX, testX, trainY, testY = preprocessing.create_dataset(0.1)
    eval(testX, testY)
Example #14
        exists = os.path.isfile(os.path.join(config.resources_dir, config.final_txt))
        # load the final converted text containing the list of lists for the
        # training if it already exists
        if exists:
            fileinfo = os.stat(os.path.join(config.resources_dir, config.final_txt))
            if fileinfo.st_size > 9500000:
                # print("jp2")
                print("final text file already exists loading it from %s..."\
                      %os.path.join(config.resources_dir, config.final_txt).split("/")[-1])
                with open(os.path.join(config.resources_dir, config.final_txt),'r') as txtfile:
                    training_list = json.load(txtfile)

        # else create it
        else:
            training_list = create_dataset(resources_dir=args.resources_dir,
                                           annotation_dict=args.annotation_dict,
                                           senseXml1=args.senseXml1,
                                           bn2wn_mapping_txt=args.bn2wn_mapping_txt)
    except:
        training_list = create_dataset(resources_dir=args.resources_dir,
                                       annotation_dict=args.annotation_dict,
                                       senseXml1=args.senseXml1,
                                       bn2wn_mapping_txt=args.bn2wn_mapping_txt)

    # create a dict containing the grid search parameters
    grid_params = {'min_count': config.min_count,
                   'window': config.window,
                   'size': config.size,
                   'sample': config.sample,
                   'alpha': config.alpha,
                   'min_alpha': config.min_alpha,
                   'negative': config.negative,
Example #15
event, values = window.Read()

selection = values['select']
window.Close()

df = pd.read_csv('/data/exchange.csv',
                 parse_dates=['date'],
                 index_col='country').loc[selection].dropna()
dataset = df.price.values
dates = df.date.values

dataset, scaler = preprocessing.normalize_dataframe(dataset)
train, test = preprocessing.split_dataset(dataset, ratio=0.6)

look_back = 30
X_train, Y_train = preprocessing.create_dataset(train, look_back)
X_test, Y_test = preprocessing.create_dataset(test, look_back)

# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
train_shape = X_train.shape

# assert False

X_train = tf.data.Dataset.from_tensor_slices((X_train, Y_train))

X_train = X_train.shuffle(10000).batch(64, drop_remainder=True)

model = networks.build_model(train_shape,
                             neurons=64,
Example #16
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/attention_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Pre-processing.
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts,
                                                        ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)

    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model.
    encoder = Encoder(num_words, return_sequences=True)
    decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path,
                        save_best_only=True,
                        save_weights_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Evaluation.
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Example #17
def run_models(words,
               models,
               verbose,
               train=True,
               test=True,
               embeddings=False):
    '''
    Runs all the specified models on the given word set.
    It runs all the preprocessing steps necessary for the specified models.
    Note: if a model is specified twice, it will be run twice, but the
    preprocessing of the input data will not be repeated (useful to test
    model parameter initialization).

    Returns a list containing the objects of the models used,
        the outputs they predicted and
        the sklearn classification reports (dictionary format),
        in the order in which they were provided.

    Keyword arguments:
        words: list of lists of words and features.
            Format: n*m. n = nr of words, m = nr of features + expected output (single)
        models: a list of strings containing the model names. Order is not important.
            Possible models are: NB, LR, SVM, HMM, CRF. Coming soon: CNN.
            If a model is specified twice, it will be run twice. The input is
            randomized only once, where applicable.
        verbose: bit flags controlling output.
            0: print nothing
            1: print results
            2: print status messages
            3: print both
    '''
    # Preparing data for one-hot encoding -- converts strings into integers
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Initial pre-processing...')
        if embeddings:
            stems = [word[0] for word in words]
            words = [word[1:] for word in words]
        X, Y, transl, labels_num, labels_name = create_dataset(words)

    # Algorithm uses sentences (list of lists of tuples): HMM
    if 'HMM' in models:
        verbose & 2 and print('Preprocessing data for HMM...')
        sentences_hmm, symbols, tag_set = words2tuples(words)
        _, y_train, _, y_test = split_tr([], sentences_hmm, 0.8)
        x_test = [[tup[0] for tup in sentence] for sentence in y_test]
        y_test = [[tup[1] for tup in sentence] for sentence in y_test]
        #shuffle_parallel(x_test,y_test)
        data_hmm = data_wrap(None, y_train, x_test, y_test)

    # Algorithms using shuffled, one-hot data: NB, LR, SVM
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Preprocessing data for NB, LR and/or SVM...')
        indexes = shuffle_parallel(X, Y)
        X_onehot_sh = one_hot(X, transl)
        if embeddings:
            verbose & 2 and print('Loading and generating embeddings...')
            X_onehot_sh = embeddings.insert_embeddings(X_onehot_sh, stems,
                                                       indexes)
        x_train_oh_sh, y_train_oh_sh, x_test_oh_sh, y_test_oh_sh = split_tr(
            X_onehot_sh, Y, 0.8)
        data_shuffled = data_wrap(x_train_oh_sh, y_train_oh_sh, x_test_oh_sh,
                                  y_test_oh_sh, transl, labels_num,
                                  labels_name)

    # Ordered, using sentences (list of lists of dicts): CRF
    if 'CRF' in models:
        verbose & 2 and print('Preprocessing data for CRF...')
        tokens_dict, labels_dict = words2dictionary(words)
        shuffle_parallel(tokens_dict, labels_dict)
        tokens_train, labels_train, tokens_test, labels_test = split_tr(
            tokens_dict, labels_dict, 0.8)
        data_dictionary = data_wrap(tokens_train, labels_train, tokens_test,
                                    labels_test)

    model_objects = []
    model_results = []
    model_predictions = []

    # Removes clutter when calling the functions separately.
    # A list of function handlers could also be used, but I find that to be
    # less intuitive.
    def _add_to_output(model_y_pred):
        model_objects.append(model_y_pred[0])
        model_results.append(model_y_pred[1])
        if (len(model_y_pred) > 2):
            model_predictions.append(model_y_pred[2])

    # Run each of the models from the parameters, while KEEPING THE ORDER they
    # were called in, and append the results to the return lists
    for model in models:
        if 'HMM' in model:
            verbose & 2 and print('Running HMM from nltk...')
            _add_to_output(HMM(data_hmm, symbols, tag_set, verbose & 1))

        if 'NB' in model:
            verbose & 2 and print('Running NB ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            if embeddings:
                _add_to_output(NB_cont(data_shuffled, verbose & 1))
            else:
                _add_to_output(NB_disc(data_shuffled, verbose & 1))

        if 'LR' in model:
            verbose & 2 and print('Running LR ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(
                LR(data_shuffled, verbose & 1, C=(0.1 if embeddings else 5)))

        if 'SVM' in model:
            verbose & 2 and print('Running SVM ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(SVM(data_shuffled, verbose & 1))

        if 'CRF' in model:
            verbose & 2 and print('Running CRF...')
            _add_to_output(CRF(data_dictionary, verbose & 1))

    return model_objects, model_results, model_predictions
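A hedged usage sketch for run_models, based only on the docstring above: the toy word rows follow the documented n*m layout (features followed by the single expected tag), but the exact feature columns and the tiny size of the data are assumptions, and the call presumes run_models and the model functions it dispatches to are already defined in the same module.

# Hypothetical call; each row is [token, stem, expected tag] by assumption.
toy_words = [
    ['The', 'the', 'DT'],
    ['cats', 'cat', 'NNS'],
    ['sleep', 'sleep', 'VBP'],
    ['soundly', 'sound', 'RB'],
]
model_objects, reports, predictions = run_models(toy_words,
                                                 models=['NB', 'CRF'],
                                                 verbose=3)
for report in reports:
    print(report)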