Example No. 1
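# For each noisy-label draw, fit cleanlab's LearningWithNoisyLabels and a plain
# logistic regression, estimate a label-noise matrix, and return the average
# over all draws.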
def rp_to_estimate_noise(data_params, n_examples, n_runs, delta_matrix):
    train, val, test = get_data(data_params, n_examples, n_runs, delta_matrix)
    X_train, y_train, y_train_tildes = train
    X_val, y_val = val
    X_test, y_test = test
    noise_matrices = []

    for y_train_tilde in y_train_tildes:
        lnl = cleanlab.classification.LearningWithNoisyLabels(
            clf=LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   class_weight='balanced'))
        lnl.fit(X_train, y_train_tilde)
        y_train_pred = lnl.predict(X_train)

        lr = LogisticRegression(solver='lbfgs',
                                multi_class='multinomial',
                                class_weight='balanced')
        lr.fit(X_train, y_train_tilde)
        y_pred_proba = lr.predict_proba(X_train)
        noise_matrix = estimate_noise_matrix(X_train, y_train_tilde,
                                             y_train_pred, y_pred_proba)
        noise_matrices.append(noise_matrix)

    avg_noise_matrix = np.mean(noise_matrices, axis=0)
    return avg_noise_matrix
Example No. 2
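    # Network-guided tree search: repeatedly expand a leaf, evaluate it with the
    # policy/value network, back the value up, and finally return the normalized
    # visit counts N**(1/tau) over the root's actions.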
    def run(self, sess, network, world, root, iterations=1600, tau=1):

        if root.parent is not None:
            raise ValueError("Root's parent must be None.")

        for i in range(iterations):

            node, _, rew, raw_obs = find_leaf(root,
                                              self.tree_policy,
                                              world=deepcopy(world))

            # Expectimax-ish
            #reward = node.simulation(world, rew)
            ob = get_data(np.array(raw_obs)[-2:])[node.id:node.id + 1]
            P, v = sess.run(
                [network.softmax_policy, network.pred_Q],
                feed_dict={
                    network.obs: np.array([[y.A for y in x] for x in ob]),
                    network.training_flag: False
                })

            node.Ps = P[0]

            self.backup(node, v[0][0])

        pi = [0] * len(root.actions)
        for action in np.arange(len(root.actions)):
            pi[action] = root.children[str(action)].N**(1 / tau)

        pi = np.array(pi)
        return pi / float(np.sum(pi))
Example No. 3
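# Load the character-level Discord model, restore its saved checkpoint weights
# (epoch 20), and store a OneStep sampler in the module-level MODEL.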
def load_model():
    global MODEL
    dataset, ids_from_chars, chars_from_ids = train.get_data(
        'discord_data.txt')
    model = train.create_model(ids_from_chars)
    train.restore(
        model, 20,
        os.path.join('./training_checkpoints_discord_2', "ckpt_{epoch}.ckpt"))
    MODEL = train.OneStep(model, chars_from_ids, ids_from_chars)
Example No. 4
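# Split the test data into one DataFrame per CATEGORY_ID (0-29) and record the
# category name of each.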
def classified_data():
    total_data = get_data(is_test=True)
    # ['ASIN', 'FILENAME', 'IMAGE_URL', 'TITLE', 'AUTHOR', 'CATEGORY_ID', 'CATEGORY']
    classified_data_list = []
    class_names = []
    for i in range(30):
        category_data = total_data[total_data.CATEGORY_ID == i]
        classified_data_list.append(category_data)
        class_names.append(category_data.CATEGORY.values[0])
    return classified_data_list, class_names
Example No. 5
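# Load a saved model, run it over a validation slice with ImageSequence,
# accumulate a 2x2 confusion matrix for one output column, and write the
# resulting precision and recall.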
def main():
    """
    DEPRECATED (only one class)
    Load and validate a model. Generate and save precision and recall scores.
    :return:
    """
    train_labels = train.get_data()
    modelOfInterest = "17-7-25/"
    model = train.standard_load_model(modelOfInterest)
    # load test data
    batch_size = 30
    valid_l = 28000
    valid_h = 31000
    dm = 512
    predictions = 3
    test_generator = train.ImageSequence(
        train_labels=train_labels[valid_l:valid_h],
        batch_size=batch_size,
        dm=dm,
        start=valid_l,
        predictions=predictions)
    # initialize confusion matrix
    membership = 0  # column of predicted and actual results to examine
    a0 = np.zeros((2, 2))
    # build confusion matrix
    num_batches = test_generator.__len__()
    for batch in range(num_batches):
        x_test, y_act = test_generator.__getitem__(batch)
        y_act = np.round(y_act)
        y_pred = np.round(model.predict(x_test), 0)

        print(str(batch) + " out of " + str(num_batches))
        for i in range(y_pred.shape[0]):
            # NOTE: this produces backwards results for model ants/ and model
            # showers/; examine a different column when building the confusion
            # matrix for those models.
            prediction = int(y_pred[i][membership])
            actual = int(y_act[i][membership])
            a0[prediction][actual] += 1
    # calculate performance metrics
    actual_positives = a0[0][1] + a0[1][1]  # FN + TP
    recall = a0[1][1] / actual_positives
    predicted_positives = a0[1][0] + a0[1][1]  # FP + TP
    precision = a0[1][1] / predicted_positives
    # save the results
    notes = [
        "file: train.csv", "range: " + str(valid_l) + ":" + str(valid_h),
        "batch size: " + str(batch_size)
    ]
    precision = ["precision: ", str(precision)]
    recall = ["recall: ", str(recall)]
    write_performance_single(modelOfInterest, a0, precision, recall, notes)
Example No. 6
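# Define a small grid of learning rates and Adam-style (beta1, beta2, epsilon)
# values and run search_parameters over it.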
def main():
    lrs = [.01, .1, 1]
    beta1s = [.9]
    beta2s = [.999]
    epsilons = [.1]

    # lrs = [.01, .1]
    # beta1s = [.8]
    # beta2s = [.999]
    # epsilons = [.1]

    train_labels = train.get_data()

    search_parameters(lrs, beta1s, beta2s, epsilons, train_labels=train_labels)
Example No. 7
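# Train a simple GAN on MNIST: alternate discriminator updates on real and
# generated batches (with soft labels) and generator updates through the
# adversarial network with the discriminator frozen.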
def train(epochs=1, batch_size=128, path=''):
    # Load the MNIST dataset via Keras; only the 60,000 training
    # examples are used.
    (X_train, _), _ = get_data(True)

    # Creating GAN
    generator = make_generator()
    discriminator = make_discriminator()
    adversial_net = make_adversial_network(generator, discriminator)

    visualize_generator(0, generator, path=path)
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')

        discr_loss = 0
        gen_loss = 0
        for _ in tqdm(range(batch_size)):
            noise = generate_latent_noise(batch_size)
            generated_images = generator.predict(noise)

            real_images = X_train[np.random.choice(X_train.shape[0],
                                                   batch_size,
                                                   replace=False)]

            discrimination_data = np.concatenate(
                [real_images, generated_images])

            # Labels for generated and real data, uses soft label trick
            discrimination_labels = 0.1 * np.ones(2 * batch_size)
            discrimination_labels[:batch_size] = 0.9

            # To train, we alternate between training just the discriminator
            # and just the generator.
            discriminator.trainable = True
            discr_loss += discriminator.train_on_batch(discrimination_data,
                                                       discrimination_labels)

            # Trick to 'freeze' the discriminator weights inside adversial_net:
            # only the generator weights, which adversial_net shares with the
            # generator, will be updated.
            discriminator.trainable = False
            # N.B.: the labels are all ones here because now we want to 'fool'
            # the discriminator.
            gen_loss += adversial_net.train_on_batch(noise,
                                                     np.ones(batch_size))

        print(f'Discriminator Loss: {discr_loss/batch_size}')
        print(f'Generator Loss:     {gen_loss/batch_size}')
        visualize_generator(epoch + 1, generator, path=path)
Example No. 8
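    # Load and de-normalize the time series for `crypto`, then return its last
    # 100 points as {x, y} dicts, with x counting back from the most recent
    # sample.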
    def get(self, crypto):
        _, _, norm, actual_data = get_data(filename=crypto + ".csv",
                                           num_days_to_predict=1)
        actual_data *= norm  # scale back using the returned normalization factor
        print(actual_data.shape)
        values = []
        for i, data in enumerate(actual_data):
            values.append({
                "x": str(-actual_data.shape[0] + i),
                "y": float(data[0])
            })
        return values[-100:]
Example No. 9
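# Compare weighted F1 on the test set: cleanlab's LearningWithNoisyLabels and a
# logistic-regression baseline are fit per noisy-label draw, plus one baseline
# fit on the clean labels; return the mean and standard deviation per method.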
def learn_with_noisy_labels(data_params, n_examples, n_runs, delta_matrix):
    train, val, test = get_data(data_params, n_examples, n_runs, delta_matrix)
    X_train, y_train, y_train_tildes = train
    X_val, y_val = val
    X_test, y_test = test

    rp_scores = []
    baseline_noisy_scores = []
    baseline_clean_scores = []
    for y_train_tilde in y_train_tildes:
        lnl = cleanlab.classification.LearningWithNoisyLabels(
            clf=LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   class_weight='balanced'))
        lnl.fit(X_train, y_train_tilde)
        y_pred = lnl.predict(X_test)
        rp_scores.append(f1_score(y_test, y_pred, average='weighted'))

        lr = LogisticRegression(solver='lbfgs',
                                multi_class='multinomial',
                                class_weight='balanced')
        lr.fit(X_train, y_train_tilde)
        y_pred = lr.predict(X_test)
        baseline_noisy_scores.append(f1_score(y_test, y_pred, average='weighted'))

    lr = LogisticRegression(solver='lbfgs',
                            multi_class='multinomial',
                            class_weight='balanced')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    baseline_clean_scores.append(f1_score(y_test, y_pred, average='weighted'))

    scores = [rp_scores, baseline_clean_scores, baseline_noisy_scores]
    name = ["rp", "baseline_clean", "baseline_noisy"]
    res = []
    for sc in scores:
        res.append((sum(sc) / len(sc), np.std(np.array(sc))))
    return name, res
Example No. 10
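# Train an LSTM reranking model with SGD using the Keras 1.x generator API,
# checkpointing on validation accuracy with early stopping, then report test
# accuracy and total runtime.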
def main():
    start_time = time()
    data = get_data()
    X_train = np.asarray(data['X_train'])
    X_dev = np.asarray(data['X_dev'])
    X_test = data['X_test']
    X_train_feats = np.asarray(data['X_train_feats'])
    X_dev_feats = np.asarray(data['X_dev_feats'])
    X_test_feats = data['X_test_feats']
    y_train = np.asarray(data['y_train'])
    y_dev = np.asarray(data['y_dev'])
    y_test = data['y_test']
    word_index = data['word_index']
    tag_index = data['tag_index']
    feature_sizes = data['feature_sizes']

    tag_size = len(tag_index)
    vocab_size = len(word_index)

    embedding_matrix = get_embedding_matrix(word_index)

    print('\nLoading candidate predictions for training data...')


    print('\nTraining reranking models...')
    model = LSTMReranking(vocab_size, feature_sizes, tag_size, embedding_matrix)
    model.summary()

    sgd = SGD(lr=0.01, momentum=0.7, clipnorm=5)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['acc'])

    print('\nTrain...')
    checkpointer = ModelCheckpoint(MODEL_FILE,
                                   monitor='val_acc',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto')

    early_stopping = EarlyStopping(monitor='val_acc',
                                   min_delta=0,
                                   patience=3,
                                   verbose=1,
                                   mode='auto')

    model.fit_generator(data_generator(X_train, X_train_feats, y_train, tag_size, batch_size=batch_size),
                        samples_per_epoch=len(X_train)//batch_size*batch_size,
                        nb_epoch=nb_epoch,
                        verbose=1,
                        callbacks=[checkpointer, early_stopping],
                        validation_data=data_generator(X_test, X_test_feats, y_test, tag_size, batch_size=batch_size),
                        nb_val_samples=len(X_test)//batch_size*batch_size
                        )
    model.load_weights(MODEL_FILE)

    print('\nTesting...')
    _, acc = model.evaluate_generator(data_generator(X_test, X_test_feats, y_test, tag_size),
                                      val_samples=len(X_test))

    print('Test accuracy: {}.'.format(acc))

    seconds = time() - start_time
    minutes = seconds / 60
    print('[Finished in {} seconds ({} minutes)]'.format(str(round(seconds, 1)),
                                                         str(round(minutes, 1))))
Example No. 11
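# Restore saved melody-completion model weights and run a prediction on the
# first training example.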
from datetime import datetime
import os

import tensorflow as tf
import numpy as np
from config import Config
from dataset import MidiDataset
from model import get_model
from train import get_data

if __name__ == "__main__":
    filepath = "/home/guy/melody_completions/runs/run_20200613_215938/model.33-0.14.h5"
    x_train, y_train, x_test, y_test = get_data(
        base_folder=Config().BASE_FOLDER)

    model = get_model()

    model.load_weights(filepath)

    y_pred = model.predict(x_train[:1])
Example No. 12
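# Grid-search learning rate and batch size, optionally with differential
# privacy (args.disable_dp), averaging accuracy over `iter` repeats and
# appending per-epoch results to a CSV file.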
def main():
    # Pre-trained model
    VALID_ARCH_CHOICES = ("vgg16", "vgg13", "densenet121")

    # Parse command line arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("data_dir",
                    help="Directory containing the dataset (default: data)",
                    default="data",
                    nargs="?")
    ap.add_argument(
        "--arch",
        help="Model architecture from 'torchvision.models' (default: vgg16)",
        choices=VALID_ARCH_CHOICES,
        default=VALID_ARCH_CHOICES[0])
    #    ap.add_argument("--hidden-units",
    #                    help="Number of units the hidden layer should consist of (default: 512)",
    #                    default=512,
    #                    type=int)
    ap.add_argument(
        "--cpu",
        help="Use CPU (else GPU) for training (default if not set: GPU)",
        action="store_true")
    args = ap.parse_args()

    device = "cpu" if args.cpu else "cuda"
    args.device = device
    args.noise = 0.25
    args.clip = 1.0
    args.batch_size = 64
    args.hidden_units = 256
    args.delta = 1e-4

    # Build model: choose loss function, optimizer, processor support
    #    # Done later to reset the model
    #    model = hybrid_model(arch=args.arch, hidden_units=args.hidden_units)
    criterion = nn.NLLLoss()

    # ===== TUNING ===========================================================
    # Hyperparameters to test
    lr_range = [1e-4]  #####  <== choice (enumeration)
    batch_size_range = [
        32, 16, 8, 2
    ]  #, 32, 128, 8, 4,  1] #####  <== choice (enumeration)
    epochs = 30  #####  <== choice (1 value=max)
    # Number of iterations for each parameter setting (`iter` shadows the built-in)
    iter = 1  #####  <== choice (single value)

    # DP or not DP, that is the question
    args.disable_dp = False  #####  <== choice (boolean)
    # ========================================================================

    # File to export results
    dp_or_not = "noDP_" if args.disable_dp else "DP_"
    file = "experiment_stats/accuracy_data_" + dp_or_not
    file += str(datetime.datetime.today()).replace(' ', '_') + ".csv"

    steps = len(lr_range) * len(batch_size_range) * iter
    step = 0

    # Write column titles
    with open(file, 'w') as f:
        f.write(
            'learning_rate, batch_size, n_epochs, accuracy, n_times_for_avg\n')

    # Experiment loops
    for lr in lr_range:
        args.learning_rate = lr

        for bs in batch_size_range:
            args.batch_size = bs
            # Load the dataset into a dataloader  ### default test batch size ###
            trainloader, testloader, mapping = get_data(
                data_folder=args.data_dir, batch_size=bs)
            args.sample_size = len(trainloader.dataset)

            #for epochs in epochs_range:
            accuracy_sum = []

            for _ in range(iter):
                # Reset the model
                model, optimizer = hybrid_model(arch=args.arch,
                                                hidden_units=args.hidden_units,
                                                args=args)
                step += 1
                _, acc = train(
                    model=model,
                    trainloader=trainloader,
                    testloader=testloader,
                    epochs=epochs,
                    print_every=None,
                    criterion=criterion,
                    optimizer=optimizer,
                    device=device,
                    arch=args.arch,
                    model_dir='',
                    serialize=False,
                    detail=False,
                    args=args,
                )
                acc = np.multiply(acc, 100)
                accuracy_sum.append(acc)
                print(f' {step}/{steps}\tlr={lr}, bs={bs},')
                for n_epoch, accur in enumerate(acc, start=1):
                    line = f'{lr}, {bs}, {n_epoch}, {accur:.2f}, 1\n'
                    with open(file, 'a') as f:
                        f.write(line)
                    print(f'\t. ×{n_epoch} epoch{"s" if n_epoch > 1 else " "}'
                          f' => accuracy = {accur:.2f}%')

            # Average over identical settings, repeated `iter` times
            if iter > 1:
                acc_avg = np.average(accuracy_sum, axis=0)
                for n_epoch, accur in enumerate(acc_avg, start=1):
                    line = f'{lr}, {bs}, {n_epoch}, {accur:.2f}, {iter}\n'
                    with open(file, 'a') as f:
                        f.write(line)
                    print(
                        f'\t\t>>> Average on {iter} iterations >>>\tlr={lr}, bs={bs},'
                        f' ×{n_epoch} epoch{"s" if n_epoch > 1 else " "}'
                        f' => accuracy = {accur:.2f}%')
Example No. 13
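# Rebuild a DNN from the hyperparameters stored in a results file, restore its
# TensorFlow weights, evaluate it on the train/test/score datasets, plot ROC
# curves, and optionally compute a saliency map.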
def eval_score(results_file_base,
               results_file_ext,
               weights_filename,
               saliency=False):
    '''
    Evaluate a trained model on the score dataset

    Arguments
    - results_file_base: str
        Base results file name. Usually includes run_id but leaves out file extension
    - results_file_ext: str
        Results file extension, excluding the period (e.g. 'results')
    - weights_filename: str
        Filename of saved Tensorflow weights
    - saliency: bool, default = False
        Whether to compute and plot the saliency map
    '''
    # read results of best run
    results_file = results_file_base + '.' + results_file_ext
    results_file_dtypes = results_file + '_dtypes'
    # dtypes_series = pd.read_csv('dtype_series', header=None)
    # dtypes_series = dtypes_series.set_index(0).squeeze()
    # dtypes_dict = dtypes_series.to_dict()
    df = pd.read_csv(results_file, header=0, float_precision='high',
                     sep='\t')  # dtype=dtypes_dict
    series = df.iloc[0]
    params = series.to_dict()

    # Get data
    datasets = ['train', 'test', 'score']
    metrics = ['loss', 'acc', 'auroc', 'auroc_sk']
    X, Y = train.get_data(params, datasets)

    # unpack params
    rand_seed = params['rand_seed']
    kernel_reg_const = params['kernel_reg_const']
    num_features = params['num_features']
    q = params['q']

    node_array = params['node_array'].split(',')
    for i in range(len(node_array)):
        node_array[i] = int(node_array[i].strip('[] '))
    node_array = np.array(node_array)

    # rebuild model
    model = models.DNN(num_features, node_array, kernel_reg_const, rand_seed)

    # recreate results dict
    loss, acc, auroc, auroc_sk, y_prob = {}, {}, {}, {}, {}
    for res in [loss, acc, auroc, auroc_sk, y_prob]:
        for dataset in datasets:
            res[dataset] = []

    results = {
        'best_index': 0,
        'loss': loss,
        'acc': acc,
        'auroc': auroc,
        'auroc_sk': auroc_sk,
        'y_prob': y_prob
    }

    # restore graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, weights_filename)

        # evaluate model on all datasets, including score
        train.evaluate_model(X, Y, model, q, results, datasets, sess)
        for dataset in datasets:
            y_prob = sess.run(model.y_prob, feed_dict={model.x: X[dataset]})
            results['y_prob'][dataset] = y_prob

        # plot ROC curve and save results
        train.plot_ROC(X, Y, results, datasets, results_file_base)
        train.save_results(X, params, results, metrics, datasets,
                           results_file_base)

        # compute and plot saliency map
        if saliency:
            saliency_vecs = train.saliency(X, Y, model, sess)
            train.plot_saliency(saliency_vecs, num_features, results_file_base)
Example No. 14
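# For each hyperparameter combination, train a DualStudent model on the labeled
# and unlabeled data, evaluate it on the validation set, and log the results
# with TensorBoard HParams; combinations already on disk are skipped.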
def run_possibilities(dataset_path, logs_path, possibilities):
    x_train_labeled, x_train_unlabeled, y_train_labeled, x_val, y_val = get_data(
        dataset_path=dataset_path,
        normalization=NORMALIZATION,
        unlabeled_percentage=UNLABELED_PERCENTAGE,
        seed=SEED)
    _, evaluation_mapping, _ = timit.get_phone_mapping()
    n_classes = get_number_of_classes()

    for consistency_loss, schedule, sigma, consistency_scale, stabilization_scale, xi in possibilities:
        hparams = {
            'consistency_loss': consistency_loss,
            'schedule': schedule,
            'sigma': sigma,
            'consistency_scale': consistency_scale,
            'stabilization_scale': stabilization_scale,
            'xi': xi
        }

        for k, v in hparams.items():
            print(f'{k}={v}, ', end='')
        print()

        config = Config(version='mono_directional',
                        n_hidden_layers=N_HIDDEN_LAYERS,
                        n_units=N_UNITS,
                        n_epochs=N_EPOCHS,
                        batch_size=BATCH_SIZE,
                        unlabeled_percentage=UNLABELED_PERCENTAGE,
                        optimizer=OPTIMIZER,
                        consistency_loss=consistency_loss,
                        consistency_scale=consistency_scale,
                        stabilization_scale=stabilization_scale,
                        xi=xi,
                        sigma=sigma,
                        schedule=schedule,
                        schedule_length=SCHEDULE_LENGTH,
                        normalization=NORMALIZATION,
                        seed=SEED)

        logs_path_ = logs_path / str(config)
        if logs_path_.is_dir():  # skip what's already done (e.g. in case of crashes)
            print('already done, skipping...')
            continue
        logs_path_.mkdir(parents=True)
        logs_path_ = str(logs_path_)

        model = DualStudent(n_classes=n_classes,
                            n_hidden_layers=config.n_hidden_layers,
                            n_units=config.n_units,
                            consistency_loss=config.consistency_loss,
                            consistency_scale=config.consistency_scale,
                            stabilization_scale=config.stabilization_scale,
                            xi=config.xi,
                            padding_value=PADDING_VALUE,
                            sigma=config.sigma,
                            schedule=config.schedule,
                            schedule_length=config.schedule_length,
                            version=config.version)

        model.compile(optimizer=get_optimizer(config.optimizer))

        model.train(x_labeled=x_train_labeled,
                    x_unlabeled=x_train_unlabeled,
                    y_labeled=y_train_labeled,
                    n_epochs=config.n_epochs,
                    batch_size=config.batch_size,
                    seed=config.seed)

        results = model.test(x=x_val,
                             y=y_val,
                             batch_size=config.batch_size,
                             evaluation_mapping=evaluation_mapping)

        with tf.summary.create_file_writer(logs_path_).as_default():
            hp.hparams(hparams)
            for k, v in results.items():
                tf.summary.scalar(k, v, step=N_EPOCHS)
Example No. 15
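# Module-level setup for an HTTP Cloud Function: load the YAML config, two
# pickled models, the training data, and a region/department mapping before
# handling requests.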
import os
import pickle
import urllib.request
from datetime import datetime

import numpy as np
import pandas as pd
import yaml

import train

print("reading yaml")
config = yaml.safe_load(open("./config.yml", "r"))
print("config loaded")

print("Let's start")
last_date = datetime.now().date()
print("loading model")
model = pickle.load(open(config["model"], 'rb'))
with open('model.pkl', 'rb') as fin:
    modelCovid = pickle.load(fin)
print("getting data")
df = train.get_data()
print("init Ok")
result = None

mapping_reg_dep = pd.read_csv("mapping_region_dep.csv",
                              dtype={
                                  "region": str,
                                  "dep": str
                              })


def get_risks(request):
    """HTTP Cloud Function.
    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
Example No. 16
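# Build a Box environment from the training data and display it.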

from box import Box
from train import get_data



training_data = get_data()
box = Box(training_data)
box_env = box.read_box()  # box environment represented by nested lists
box.show_box(box_env)
Example No. 17
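# Evaluate a saved Keras model on the test split returned by get_data and print
# its loss and accuracy.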
import tensorflow as tf
from train import get_data
from keras.models import load_model
import numpy as np

if __name__ == "__main__":
    model = load_model('my_model.h5')
    train_X, train_y, val_X, val_y, test_X, test_y = get_data()
    loss, accuracy = model.evaluate(test_X, test_y)
    print(loss, accuracy)
    #for value, prediction in zip(test_y, model.predict(test_X)):
    #    print(value, prediction, np.mean(np.square(value - prediction)))
Example No. 18
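# Evaluate a multi-task BERT checkpoint on one task's test set, accumulating
# per-class true/false positive and false negative counts (the snippet is
# truncated before the evaluation loop body).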
from transformers import BertTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
import argparse
import os
import torch

# NOTE: get_data, collate, BertMultiTask and get_n_classes are assumed to be
# imported from the project's own modules (not shown in this snippet).

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--branch', type=int, default=0)
    parser.add_argument('--task', type=int)
    parser.add_argument('--train_tasks', nargs='+', type=int)
    args = parser.parse_args()

    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    test_data = get_data(args.task, 'test')
    test_loader = DataLoader(test_data,
                             args.batch_size,
                             collate_fn=lambda x: collate(tokenizer, x),
                             pin_memory=True)

    model = BertMultiTask([get_n_classes(t) for t in args.train_tasks])
    model.load_state_dict(torch.load(args.ckpt))
    model = model.cuda().eval()

    all_correct = correct = total = 0
    tp = torch.zeros(get_n_classes(args.task))
    fp = torch.zeros(get_n_classes(args.task))
    fn = torch.zeros(get_n_classes(args.task))

    for inputs, labels in tqdm(test_loader):
Example No. 19
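# Clean and tokenize the raw dataset into processed.csv, then train and
# validate a model on the data returned by get_data().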
from train import get_data
from train import train
from train import validate_model
from nltk.tokenize import word_tokenize
import re
import string
import io
import csv
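# NOTE: preprocess() used below is assumed to be defined or imported elsewhere
# in the original project.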

with open('dataset.csv') as f:
    reader = csv.reader(f, delimiter=',')
    # Skip the first (header) row
    next(reader)
    with open('processed.csv', 'w', encoding='UTF-8') as new_file:
        for row in reader:
            for messages in row[1:-3]:
                # Drop non-ASCII characters
                encoded_string = messages.encode("ascii", "ignore")
                messages = encoded_string.decode()
                # Tokenize and preprocess each message
                text = preprocess(word_tokenize(messages))
            for classification in row[:1]:
                classify = classification
            # Write each classification and its processed text to the new file
            new_file.write(classify + "," + text + "\n")

X_train, X_test, y_train, y_test = get_data()
model = train(X_train, y_train)
validate_model(model, X_test, y_test)