Example #1
def run_transform(name, data_x, data_y, transformer):
    print("Working on {}...".format(name))

    report_name = "reports/{}_nn_output.txt".format(name)
    sys.stdout = open(report_name, "w")

    #2 transform the data
    transform_x = transformer(data_x, data_y, name)
    plot_corr(name, pd.DataFrame(data=transform_x), data_y)

    kmeans_name = "{} KMeans Clustered".format(name)
    gmm_name = "{} GMM Clustered".format(name)

    #3 cluster the transformed data
    kmeans_clustered = kmeans(kmeans_name, transform_x, data_y)
    gmm_clustered = gmm(gmm_name, transform_x, data_y)

    #4 run neural network on transformed data
    x_train, x_test, y_train, y_test = split_data(transform_x, data_y)
    run_nn(name, x_train, x_test, y_train, y_test)

    #5 run the neural network on the clusters from #3 (clusters of the dimensionality-reduced data)
    kmx_train, kmx_test, kmy_train, kmy_test = split_data(
        kmeans_clustered, data_y)
    run_nn(kmeans_name, kmx_train, kmx_test, kmy_train, kmy_test)

    gmmx_train, gmmx_test, gmmy_train, gmmy_test = split_data(
        gmm_clustered, data_y)
    run_nn(gmm_name, gmmx_train, gmmx_test, gmmy_train, gmmy_test)

    sys.stdout.close()  # close the report file before restoring stdout
    sys.stdout = sys.__stdout__

    print("Finished {}!".format(name))
    print()
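Note: this example assumes a split_data(X, y) helper that returns (x_train, x_test, y_train, y_test); the actual utils implementation is not shown in this listing. A minimal sketch of such a helper, with the split ratio and shuffling as assumptions, could look like this:

import numpy as np

def split_data(x, y, test_ratio=0.2, seed=0):
    """Hypothetical sketch: shuffle x and y together, then split off a test set."""
    x, y = np.asarray(x), np.asarray(y)
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(x))            # one permutation keeps x and y aligned
    cut = int(len(x) * (1.0 - test_ratio))   # boundary between train and test rows
    train_idx, test_idx = idx[:cut], idx[cut:]
    return x[train_idx], x[test_idx], y[train_idx], y[test_idx]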
Example #2
def make_src_tgt_data():
    train_dir = "train.txt"
    eval_dir = "eval.txt"
    test_dir = "test.txt"

    train_data = utils.load_data(train_dir)
    eval_data = utils.load_data(eval_dir)
    test_data = utils.load_data(test_dir)

    train_data = utils.filter_sentences_with_punct(train_data)
    eval_data = utils.filter_sentences_with_punct(eval_data)
    test_data = utils.filter_sentences_with_punct(test_data)

    token_train_data = utils.tokenize_data(train_data)
    token_eval_data = utils.tokenize_data(eval_data)
    token_test_data = utils.tokenize_data(test_data)

    index2word, word2index = utils.build_vocab_with_nltk(
        token_train_data, 10000)

    train_src_data, train_tgt_data = utils.split_data(token_train_data, Limits)
    eval_src_data, eval_tgt_data = utils.split_data(token_eval_data, Limits)
    test_src_data, test_tgt_data = utils.split_data(token_test_data, Limits)

    utils.save_data("train_src_data.txt", train_src_data)
    utils.save_data("train_tgt_data.txt", train_tgt_data)
    utils.save_data("eval_src_data.txt", eval_src_data)
    utils.save_data("eval_tgt_data.txt", eval_tgt_data)
    utils.save_data("test_src_data.txt", test_src_data)
    utils.save_data("test_tgt_data.txt", test_tgt_data)
    utils.save_data("vocab.txt", index2word)
Example #3
def _data(data_pth, split_val=True, verbose=1):
    data = np.load(data_pth, allow_pickle=True)
    x, y = data['x'], data['y']
    x = x[:, :, np.newaxis]
    x_train, y_train, x_test, y_test = split_data(x, y)

    class_weights_dict = calc_class_weights(y_train)

    if split_val:
        x_train, y_train, x_val, y_val = split_data(x_train, y_train)
        y_train = to_one_hot(y_train, dimension=10)
        y_test = to_one_hot(y_test, dimension=10)
        y_val = to_one_hot(y_val, dimension=10)
        if verbose:
            print('\nx_train shape: %s'
                  '\ny_train shape: %s'
                  '\nx_test shape: %s'
                  '\ny_test shape: %s'
                  '\nx_val shape: %s'
                  '\ny_val shape: %s' %
                  (x_train.shape, y_train.shape, x_test.shape, y_test.shape,
                   x_val.shape, y_val.shape))
        return x_train, y_train, x_test, y_test, x_val, y_val, class_weights_dict
    else:
        y_train = to_one_hot(y_train, dimension=10)
        y_test = to_one_hot(y_test, dimension=10)
        if verbose:
            print('\nx_train shape: %s'
                  '\ny_train shape: %s'
                  '\nx_test shape: %s'
                  '\ny_test shape: %s' %
                  (x_train.shape, y_train.shape, x_test.shape, y_test.shape))
        return x_train, y_train, x_test, y_test, class_weights_dict
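Note: to_one_hot(labels, dimension=10) is assumed by this example but not shown. A minimal sketch consistent with how it is called here (integer class labels in, one-hot rows out) might be:

import numpy as np

def to_one_hot(labels, dimension=10):
    """Hypothetical sketch: map integer class labels to one-hot rows."""
    labels = np.asarray(labels, dtype=int)
    one_hot = np.zeros((len(labels), dimension))
    one_hot[np.arange(len(labels)), labels] = 1.0   # set the column of each label
    return one_hot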
Example #4
def load_dataset(cfg):
    length_std = float(cfg['length_std'])
    length_mean = float(cfg['length_mean'])
    noise_std = float(cfg['noise_std'])
    length = int(cfg['length'])
    nb_past_steps = int(cfg['nb_past_steps'])
    nb_future_steps = int(cfg['nb_future_steps'])
    train_fraction = float(cfg['train_fraction'])
    test_fraction = float(cfg['test_fraction'])
    valid_fraction = float(cfg['valid_fraction'])

    sequence = generate_sequence(length_std, length_mean, noise_std, length)

    xs, ys = utils.sequence_to_supervised(sequence, nb_past_steps,
                                          nb_future_steps)
    xs = np.expand_dims(xs, axis=2)
    ys = np.expand_dims(ys, axis=1)

    x_train, x_valid, x_test = utils.split_data(xs, train_fraction,
                                                valid_fraction, test_fraction)

    y_train, y_valid, y_test = utils.split_data(ys, train_fraction,
                                                valid_fraction, test_fraction)

    return x_train, y_train, x_valid, y_valid, x_test, y_test
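Note: this example (and Example #5 below) calls a fraction-based utils.split_data(data, train_fraction, valid_fraction, test_fraction) that returns three arrays. A minimal sketch, assuming contiguous unshuffled slices, is:

import numpy as np

def split_data(data, train_fraction, valid_fraction, test_fraction):
    """Hypothetical sketch: slice one array into train/valid/test parts by fraction."""
    data = np.asarray(data)
    n_train = int(len(data) * train_fraction)
    n_valid = int(len(data) * valid_fraction)
    train = data[:n_train]
    valid = data[n_train:n_train + n_valid]
    test = data[n_train + n_valid:]          # remainder goes to the test split
    return train, valid, test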
Example #5
def load_data(cfg):
    xml_path        = cfg['xml_path']
    nb_past_steps   = int(cfg['nb_past_steps'])
    nb_future_steps = int(cfg['nb_future_steps'])
    train_fraction  = float(cfg['train_fraction'])
    valid_fraction  = float(cfg['valid_fraction'])
    test_fraction   = float(cfg['test_fraction'])

    xs, ys = load_glucose_data(xml_path, nb_past_steps, nb_future_steps)
    ys = np.expand_dims(ys, axis=1)

    x_train, x_valid, x_test = utils.split_data(xs, train_fraction,
            valid_fraction, test_fraction)
    y_train, y_valid, y_test = utils.split_data(ys, train_fraction,
            valid_fraction, test_fraction)

    # scale data
    scale = float(cfg['scale'])
    x_train *= scale
    y_train *= scale
    x_valid *= scale
    y_valid *= scale
    x_test  *= scale
    y_test  *= scale

    return x_train, y_train, x_valid, y_valid, x_test, y_test
Example #6
def main():
    isconcat = True
    #isconcat =False
    #modelname = 'lambdanet-b512-model.h5'
    modelname = 'lambdanet-b512-l01-model.h5'
    #modelname = 'lambdanet-b512-windows-model.h5'

    #predfilename='lamdbanet-b512-pred.dat' #acc1:76 lambdanet max poolingresult
    #predfilename = 'lambdanet-b512-pred.dat'  # acc1:60 lambdanet avg pooling result
    #predfilename = 'rank-pred.dat'#acc1:65 svm result
    predfilename = 'lambdanet-b512-l01-pred.dat'  #acc1:74 lambdanet max poolingresult
    #predfilename = 'lambdanet-b512-windows-pred.dat'

    stage = 3

    if stage <= 0:
        utils.prepare_data(vocab_size)
    if stage <= 1:
        utils.split_data(n=5)
        #utils.split_data(n=10)
    if stage <= 2:
        #train(n=5,isconcat=isconcat,modelname=modelname)
        train_lambda(n=5, isconcat=isconcat, modelname=modelname)
        #train_lambda(n=10, isconcat=isconcat, modelname=modelname)
    if stage <= 3:
        predict(n=5,
                isconcat=isconcat,
                modelname=modelname,
                predfilename=predfilename)
    if stage <= 4:
        utils.calc_metric(n=5, predfilename=predfilename)
        utils.calc_metric_method(n=5, predfilename=predfilename)
Example #7
def find_best_c(x, y, share):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_c = 2**-7
    best_f1 = 0
    for i in range(-7, 7):
        c = 2**i
        v = train(x_train, y_train, c)
        p, r = utils.process_result(test(x_check, y_check, v))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c
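Note: find_best_c assumes utils.split_data(values, share) returns a (train, check) pair. A minimal sketch under that assumption:

def split_data(values, share):
    """Hypothetical sketch: first `share` of the rows for training, the rest for checking."""
    cut = int(len(values) * share)   # boundary index for the training share
    return values[:cut], values[cut:]

Because x and y are split in two separate calls with the same share, a deterministic slice like this keeps features and labels aligned; a shuffling variant would have to reuse one shared permutation for both calls.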
Example #8
def find_best_c(x, y, share):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_c = 2 ** -7
    best_f1 = 0
    for i in range(-7, 7):
        c = 2 ** i
        v = train(x_train, y_train, c)
        p, r = utils.process_result(test(x_check, y_check, v))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c
Example #9
def main(FLAG):
    Model = SimpleModel(FLAG.input_dim,
                        FLAG.hidden_dim,
                        FLAG.output_dim,
                        optimizer=tf.train.RMSPropOptimizer(
                            FLAG.learning_rate))

    image, label = load_dataset()
    image, label = image_augmentation(image,
                                      label,
                                      horizon_flip=True,
                                      control_brightness=True)
    label = label / 96.
    (train_X, train_y), (valid_X,
                         valid_y), (test_X, test_y) = split_data(image, label)

    if FLAG.Mode == "validation":
        lr_list = 10**np.random.uniform(-6, -2, 20)
        Model.validation(train_X, train_y, valid_X, valid_y, lr_list)
    elif FLAG.Mode == "train":
        Model.train(train_X, train_y, valid_X, valid_y, FLAG.batch_size,
                    FLAG.Epoch, FLAG.save_graph, FLAG.save_model)

        pred_Y = Model.predict(test_X[123])
        print(pred_Y)
        print(test_y[123])
        print(np.mean(np.square(pred_Y - test_y[123])))
Example #10
def main():
    # initialise the models
    vmodel = VideoNet().to(device)
    amodel = AudioNet().to(device)
    avmodel = AVNet().to(device)
    vmodel.load_state_dict(torch.load('vmodel_final.pt'))
    amodel.load_state_dict(torch.load('amodel_final.pt'))
    avmodel.load_state_dict(torch.load('avmodel_final.pt'))
    print('loaded model')
    params = list(vmodel.parameters())+list(amodel.parameters())+list(avmodel.parameters())
    # optimiser = optim.Adam(params, lr=LR)
    optimiser = optim.SGD(params, lr=LR, momentum=0.9)

    list_vid = os.listdir('data/train/full_vid')  # ensure no extra files like .DS_Store are present
    train_list, val_list = utils.split_data(list_vid, 0.8, 0.2)
    # log the list for reference
    utils.log_list(train_list, 'data/train_list.txt')
    utils.log_list(val_list, 'data/val_list.txt')
    # uncomment following to read previous list
    # train_list = utils.read_list('data/train_list.txt')
    # val_list = utils.read_list('data/val_list.txt')
    train_list = ['video_001.mp4']
    composed = transforms.Compose([Resize(256), RandomCrop(224)])
    # composed = transforms.Compose([Resize(256)])
    train_loader = torch.utils.data.DataLoader(AVDataset(train_list[:1], transform=composed),
                                               batch_size=batch_size, shuffle=False, num_workers=4)
    val_loader = torch.utils.data.DataLoader(AVDataset(train_list[:1], transform=composed),
                                             batch_size=batch_size, shuffle=False, num_workers=4)
    l, p, cam = val(vmodel, amodel, avmodel, val_loader)
    print(p, cam.shape)
    import skvideo.io
    vids = skvideo.io.vread('data/train/' + 'snippet/video_001.mp4')
    # print('vids', vids)
    findcam(np.expand_dims(vids, 0), np.abs(cam.cpu().numpy()))
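Note: here utils.split_data(list_vid, 0.8, 0.2) is assumed to split a list of file names into train/validation lists by the given fractions. A hypothetical sketch of that behaviour:

import random

def split_data(items, train_frac, val_frac, seed=0):
    """Hypothetical sketch: shuffle a copy of the list, then cut it by the given fractions."""
    items = list(items)
    random.Random(seed).shuffle(items)        # deterministic shuffle of a copy
    n_train = int(len(items) * train_frac)
    n_val = int(len(items) * val_frac)
    return items[:n_train], items[n_train:n_train + n_val]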
Example #11
def test1():
    top_n = int(input("Input the number of recommendations\n"))
    k = int(input("Input the number of related users\n"))
    data = utils.get_data()
    train, test = utils.split_data(data, 2, 1, 1)
    del data

    user = int(input("Input the user id \n"))
    print("The train set contains the movies of the user: "******"it takes ", (end_time - start_time).seconds, " seconds to get W")

    start_time = datetime.datetime.now()
    rec = get_recommendation(user, train, top_n, k, weight, related_users)
    end_time = datetime.datetime.now()
    print("it takes ", (end_time - start_time).seconds, " seconds to get recommend for one user")

    print(rec)
    for item in rec:
        print(item, end="")  # keep the True/False flag on the same line
        if item in test[user]:
            print("  True")
        else:
            print("  False")
Example #12
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    X, y, sentences, index_to_word = utils.load_sentiment_data(
        FLAGS.max_length)
    vocab_size, n_classes = X.shape[2], y.shape[1]
    X_train, y_train, X_test, y_test = utils.split_data(X, y)

    with tf.Session() as sess:
        deep_pdf = SentimentRNN(
            sess,
            vocab_size=vocab_size,
            n_classes=n_classes,
            batch_size=FLAGS.batch_size,
            keep_prob=FLAGS.keep_prob,
            max_length=FLAGS.max_length,
            n_recurrent_layers=FLAGS.n_recurrent_layers,
            n_fc_layers=FLAGS.n_fc_layers,
            recurrent_layer_width=FLAGS.recurrent_layer_width,
            fc_layer_width=FLAGS.fc_layer_width,
            checkpoint_dir=FLAGS.checkpoint_dir,
            epoch=FLAGS.epoch)

        if FLAGS.is_train:
            deep_pdf.train(FLAGS, X_train, y_train, X_test, y_test)
        else:
            deep_pdf.load(FLAGS.checkpoint_dir)
Example #13
def main():
    print("--------- Naive Bayes ---------")
    data_set = utils.read_data('dataset.txt')
    train_data, test_data = utils.split_data(data_set, 0.3)

    prob_matrix, prob_yi = train(train_data)
    test(prob_matrix, prob_yi, test_data)
Example #14
def find_best_c(x, y, share, count):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_f1 = 0
    best_c = -1
    c = 10
    while c <= 40:
        w1, w2 = train(x_train, y_train, c, count)
        p, r = utils.process_result(test(x_check, y_check, w1, w2))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
        c += 10
    return best_c
Example #15
def parse_coauthor(file):
    """ Parse & convert coauthor into Dataframe.

        Args:
            -- file: coauthor file address, encoding in utf-8.
                coauthor file: https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Coauthor.zip

        ETA 10min
        """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()

    def process(d):
        df = pd.DataFrame(columns=['1st', '2nd', 'num'])
        for c in d:
            c = c.lstrip('#')
            c = c.rstrip('\n')
            df = df.append(
                {col: val
                 for col, val in zip(df.columns, c.split('\t'))},
                ignore_index=True)
        return df

    coauthor_df = multiprocess(process, split_data(data, size=2000))
    coauthor_df['num'] = coauthor_df['num'].astype('int64')
    return coauthor_df
Example #16
def choose_best_vec():
    train_sets_names = ["A", "B", "C", "D"]

    print("Choosing best vectorization method:")
    clf = LinearDiscriminantAnalysis()
    best_score = 0
    best_vec_method = ""

    for vec_method in train_sets_names:
        X_train, y_train, X_test, _ = training_data()
        X_train, X_test = extract_feature(X_train, X_test, vec_method)
        # Split Training set to predefined train and cross validation
        X_t, X_cv, y_t, y_cv, _ = split_data(X_train, y_train)

        model = clf.fit(X_t, y_t)
        y_pred = model.predict(X_cv)
        score = accuracy_score(y_cv, y_pred)

        print("Method:", vec_method, "cv accuracy:", score)

        if score > best_score:
            best_score = score
            best_vec_method = vec_method

    print("Best vectorization method:", best_vec_method, "with score of:",
          best_score)
    return best_vec_method
Example #17
    def __init__(self, env, batch: np.array, V=None, lr=1e-4, pi_eval=None):
        self.env = env
        self.batch = batch

        # for tc
        self.V = V

        if V is not None:
            self.pi = LinearPolicy(V.get_feature_vec_len(), env.action_space.n,
                                   lr)
        else:
            self.pi = LinearPolicy(env.observation_space.shape[0],
                                   env.action_space.n, lr)
        self.pi_eval = pi_eval

        # processing batch of data, generating 1-hot vectors
        data_x, data_y = utils.pre_process_batch(env, None, batch, V)

        self.train_x, self.train_y, _, _ = utils.split_data(data_x,
                                                            data_y,
                                                            ratio=0.0)
        assert len(self.train_x) == len(data_x)

        # generating separate validation set
        #val_batch = utils.generate_batch(self.env, self.pi_eval, 0.05 * len(self.batch))
        #self.test_x, self.test_y = utils.pre_process_batch(env, None, val_batch, V)
        print('Number of training trajs {}'.format(len(self.batch)))
        print('Number of training steps {}'.format(len(self.train_x)))
        #print ('Number of testing trajs {}'.format(len(val_batch)))
        #print ('Number of testing steps {}'.format(len(self.test_x)))

        self.compute_ris_estimates()
Example #18
def prepare_data(dataset_path, vectorization='bow', verbose=False):
    X, y = utils.prepare_xy(dataset_path)

    bow_path = 'app/saves/embeddings/bow.pickle'
    tfidf_path = 'app/saves/embeddings/tfidf.pickle'

    if vectorization == 'bow':
        if os.path.isfile(bow_path):
            X = load_vectors(bow_path)
        else:
            X = vecotrize_bow(X, save_path=bow_path)
    elif vectorization == 'tfidf':
        if os.path.isfile(tfidf_path):
            X = load_vectors(tfidf_path)
        else:
            X = vectorize_tfidf(X, save_path=tfidf_path)
    else:
        raise ValueError('Método não implementado!')  # "Method not implemented!"

    X_train, X_test, X_val, y_train, y_test, y_val = utils.split_data(X, y)

    if verbose:
        print('Treino')      # "Train"
        print(X_train.shape)
        print('Validação')   # "Validation"
        print(X_val.shape)
        print('Teste')       # "Test"
        print(X_test.shape)

    return X_train, X_val, X_test, y_train, y_val, y_test
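Note: vecotrize_bow/vectorize_tfidf and load_vectors are project helpers not shown in this listing. A hypothetical TF-IDF variant, assuming scikit-learn and a pickle cache that a load_vectors counterpart would read back, could be:

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_tfidf(texts, save_path):
    """Hypothetical sketch: fit TF-IDF on the raw texts and cache the matrix."""
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)       # sparse document-term matrix
    with open(save_path, 'wb') as f:
        pickle.dump(X, f)                     # cache so a later run can just load it
    return X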
Example #19
def main(_):
    try:
        os.makedirs(FLAGS.save_dir)
    except:
        pass

    print('Load data')
    full_data = utils.load_age_data(FLAGS.data_dir)

    train_data, val_data = utils.split_data(full_data)
    traingen, valgen = data.AgeDatagen(FLAGS, train_data), data.AgeDatagen(FLAGS, val_data)
    # model = Model(FLAGS, '/cpu:0')

    model = AgeModel(FLAGS)

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        sess.run(tf.assign(model.lr, FLAGS.learning_rate))

        print('Let the train begin!')
        for epoch in range(FLAGS.epochs):
            sess.run(tf.assign(model.lr, FLAGS.learning_rate))
            FLAGS.learning_rate *= FLAGS.decay_rate

            pbar = tqdm(range(FLAGS.steps))
            for _ in pbar:
                x, y = next(traingen)
                loss, _, = sess.run([model.loss, model.train_op], {model.inputs: x, model.targets: y})
                pbar.set_description("loss: {:.2f}, ".format(loss))
Example #20
def test_nn_framework():
    # Input: (M x T x B) matrix representing blocks in all ASTs
    all_matrix = utils.load_data("./data-created/q4_array_of_ast_matrices.npy")
    validation_matrix = utils.create_validation_matrix(all_matrix)
    print(all_matrix.shape)
    print(validation_matrix.shape)

    # Split into training/dev/test set
    train_matrix, dev_matrix, test_matrix = utils.split_data(all_matrix)
    print(train_matrix.shape)
    print(dev_matrix.shape)
    print(test_matrix.shape)

    # Create model
    num_timesteps = all_matrix.shape[1]
    num_blocks = all_matrix.shape[2]
    model = create_model(num_timesteps, num_blocks)

    # train the model
    #train_matrix = all_matrix[0, :, :]
    #train_labels = validation_matrix[0, :, :]
    #train_model(train_matrix, train_labels)
    train_model(model, all_matrix, validation_matrix)

    print(
        utils.accuracy_from_onehot_matrix(
            model.predict(test_matrix),
            utils.create_validation_matrix(test_matrix)))
Example #21
def item_mean_test(ratings, min_num_ratings, verbose=False, p_test=0.1):
    """
    Splits the data set in train and test and compute the RMSE using as prediction the item mean.
    :param ratings: initial data set (sparse matrix of size nxp, n items and p users)
    :param min_num_ratings: all users and items must have at least min_num_ratings per user and per item to be kept
    :param verbose: True if user wants details to be printed
    :param p_test share of the data set to be dedicated to test set
    :return: RMSE value of the prediction using item means as a predictions
b    """
    _, train, test = split_data(ratings,
                                min_num_ratings,
                                verbose=verbose,
                                p_test=p_test)
    cumulated_rmse = 0

    # accumulate the squared-error contribution of every item
    for item in range(train.shape[0]):

        # compute the mean of the non-zero ratings for the current item
        current_train_ratings = train[item]
        current_non_zero_train_ratings = current_train_ratings[
            current_train_ratings.nonzero()]

        if current_non_zero_train_ratings.shape[1] != 0:
            mean = current_non_zero_train_ratings.mean()
            # compare against all non-zero ratings of the current item in the test set
            current_test_ratings = test[item]
            current_non_zero_test_ratings = current_test_ratings[
                current_test_ratings.nonzero()].todense()
            cumulated_rmse += calculate_mse(current_non_zero_test_ratings,
                                            mean)

    cumulated_rmse = np.sqrt(float(cumulated_rmse) / test.nnz)

    return cumulated_rmse
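Note: calculate_mse is not shown here. Since the caller divides the accumulated value by test.nnz before taking the square root, the helper must return a sum of squared errors rather than a mean; a sketch under that assumption:

import numpy as np

def calculate_mse(real_ratings, prediction):
    """Hypothetical sketch: sum of squared differences between ratings and a scalar prediction."""
    real_ratings = np.asarray(real_ratings).ravel()
    return float(np.sum((real_ratings - prediction) ** 2))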
Example #22
def grade1():
    print("=" * 20 + "Grading Problem 1" + "=" * 20)
    marks = 0.0
    try:
        X, Y = utils.load_data2('data2.csv')
        X, Y = utils.preprocess(X, Y)
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
        W, train_mses, test_mses = p1.ista(X_train,
                                           Y_train,
                                           X_test,
                                           Y_test,
                                           _lambda=0.1)
        assert train_mses[-1] < 0.2

        marks += 1.5
    except:
        print('Train Error is large')

    try:
        assert test_mses[-1] < 0.25
        marks += 1.5
    except:
        print('Test Error is large')
    print("Marks obtained in Problem 1: ", marks)
    return marks
Example #23
def main():
    [data, labels] = load_data()
    [x_train, y_train, x_test, y_test] = split_data(data, labels)
    val_labels = y_test

    x_train = np.array(x_train)
    x_train = 255 - x_train
    x_train /= 255
    x_test = np.array(x_test)
    x_test = 255 - x_test
    x_test /= 255

    # multi-convnet model
    y_train = convert_to_multi_output_target(y_train)
    y_test = convert_to_multi_output_target(y_test)
    model = train(x_train, y_train, x_test, y_test)

    # convnet model
    #y_train = convert_to_general_target(y_train)
    #y_test = convert_to_general_target(y_test)
    #model = train(x_train, y_train, x_test, y_test, default_model='convnet')

    # pretrain model
    #model = load_model('my_model.h5')
    pred(model, x_test, val_labels)
Example #24
def find_best_c(x, y, share, count):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_f1 = 0
    best_c = -1
    c = 10
    while c <= 40:
        w1, w2 = train(x_train, y_train, c, count)
        p, r = utils.process_result(test(x_check, y_check, w1, w2))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
        c += 10
    return best_c
Example #25
def main():
    df_train, df_evaluate = read_challenge_data()
    df_train = process_and_filter_data(df_train, config)
    df_evaluate = process_evaluate_data(df_evaluate, config)

    X_train, X_test, y_train, y_test = split_data(df_train, config)

    run_baseline(X_train, X_test, y_train, y_test, config)

    gb_clf, y_pred_gb = run_gradient_boosting_classifier(
        X_train, X_test, y_train, y_test, config)
    rf_clf, y_pred_rf = run_random_forest(X_train, X_test, y_train, y_test,
                                          config)
    xgb_clf, y_pred_xgb = run_xgboost(X_train, X_test, y_train, y_test, config)

    # Voting classifier.
    voting_clf = VotingClassifier(estimators=[("rf", rf_clf), ("gb", gb_clf),
                                              ("xgb", xgb_clf)],
                                  voting="soft").fit(X_train, y_train)
    stratified_shuffle_split = StratifiedKFold(n_splits=10)
    cross_val_score_ = cross_val_score(voting_clf,
                                       X_train,
                                       y_train,
                                       cv=stratified_shuffle_split).mean()
    LOGGER.info(
        f"Voting classifier cross validation score: {cross_val_score_}")
    if config["test"]:
        print(classification_report(voting_clf.predict(X_test), y_test))

    final_model = voting_clf
    final_predictions = final_model.predict(df_evaluate)

    pd.DataFrame(final_predictions).to_csv("data/101617.txt",
                                           index=False,
                                           header=False)
Example #26
def grade1():
    marks = 0

    try:
        X = np.random.rand(110, 5)
        Y = np.random.rand(110, 1)
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y, 0.75)
        assert np.allclose(np.vstack([X_train, X_test]), X)
        assert np.allclose(np.vstack([Y_train, Y_test]), Y)
        assert len(X_train) == 82 and len(Y_train) == 82
        marks += 0.5
    except:
        print('Q1 split_data() incorrect', file=stderr)
        return marks

    try:
        x = np.array([
            9.71711545, 5.27658861, 0.74957658, 7.25267862, 1.57512235,
            4.95493874, 4.6645458, 8.81014817, 5.6875507, 8.9270358
        ]).reshape(10, 1)
        y = np.array([
            7.4395211, 1.29711056, 4.99824035, 1.87706798, 0.93306619,
            6.65645683, 8.6573449, 2.54946024, 1.3023241, 6.52289899
        ]).reshape(10, 1)
        w = 0.513244
        b = 1.839345
        assert np.isclose(single_var_reg.mse(x, y, w, b), 4.319008411331635)
        marks += 0.5
    except:
        print('Q1 mse() incorrect', file=stderr)
        return marks

    try:
        X, Y = utils.load_data1('data1.csv')
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
        w, b, train_mses, test_mses = single_var_reg.ordinary_least_squares(
            X_train, Y_train, X_test, Y_test)
        assert train_mses[-1] < 52
        assert test_mses[-1] < 68
        for i in range(len(train_mses) - 1):
            assert train_mses[i] >= train_mses[i + 1]
        marks += 3
    except:
        print('Q1 ordinary_least_squares() incorrect', file=stderr)
        return marks

    return marks
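Note: the assertions above pin down the expected behaviour of utils.split_data(X, Y, ratio): it must keep row order (so vstack([train, test]) reproduces the input) and put int(0.75 * 110) = 82 rows in the training part. A sketch consistent with those checks (the default ratio is an assumption):

def split_data(X, Y, train_ratio=0.8):
    """Hypothetical sketch: order-preserving split of X and Y at the same row index."""
    n_train = int(len(X) * train_ratio)       # e.g. int(110 * 0.75) == 82
    return X[:n_train], Y[:n_train], X[n_train:], Y[n_train:]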
Example #27
def main():
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--emb_file', default='data/giga5_glv.pkl', type=str)
    parser.add_argument('--corpus_file',
                        default='data/semcor_corpus.pkl',
                        type=str)
    parser.add_argument('--load_corpus_from_file', default=True, type=bool)
    parser.add_argument('--model_name', default='AverageLinear', type=str)
    parser.add_argument('--context_size', default=1, type=int)
    parser.add_argument('--lr', default=0.1, type=float)
    parser.add_argument('--max_epochs', default=5, type=int)
    parser.add_argument('--clip_grad', default=5.0, type=float)
    parser.add_argument('--batch_size', default=16, type=int)
    parser.add_argument('--val_iter', default=500, type=int)
    parser.add_argument('--print_iter', default=200, type=int)
    parser.add_argument('--hidden_size', default=50, type=int)
    args = parser.parse_args()

    # set random seeds
    #np.random.seed(100)
    #torch.manual_seed(10)

    # get the data
    if args.load_corpus_from_file and os.path.exists(args.corpus_file):
        # load the data from file if it already exists
        with open(args.corpus_file, 'rb') as cf:
            dataset = pickle.load(cf)
            print("Loaded corpus from {}!".format(args.corpus_file))
    else:
        dataset = SemCor(args.context_size)
        print("Parsed corpus!")
        # save the corpus for next time
        with open(args.corpus_file, 'wb') as cf:
            pickle.dump(dataset, cf)
            print("Saved corpus to {} !".format(args.corpus_file))

    # split up the data
    train_data, val_data, test_data = split_data(dataset)

    # load embeddings from file
    emb_weight_matrix_df = pd.read_pickle(args.emb_file)
    emb_weight_matrix = torch.tensor(emb_weight_matrix_df.values)

    # make the model
    if args.model_name == "AverageLinear":
        model = AverageLinear(dataset.max_num_senses, emb_weight_matrix,
                              emb_weight_matrix_df)
    if args.model_name == "LSTMEncoder":
        model = LSTMEncoder(dataset.max_num_senses, args.hidden_size,
                            emb_weight_matrix, emb_weight_matrix_df)
    else:
        raise Exception("Invalid model name: {}".format(args.model_name))

    # run the model
    train(model, train_data, val_data, args)
    test(model, test_data)
Example #28
    def eval_on_dataset(self, dataset, is_dataset_csv=False):
        if is_dataset_csv:
            dataset = utils.load_data_from_csv(dataset, self.use_cols)
        # Split data to features and labels
        x, y_true = utils.split_data(dataset, self.y_col)
        # Classify data
        y_pred = self.predict(x)
        # Evaluate predictions
        return utils.eval_predictions(y_true, y_pred)
Example #29
def sub_split(args):
    ts = args.testsize
    if 0 < ts < 1:
        data_train, data_test, names = ul.split_data(args.file, ts)
        header = ','.join(names)
        ul.write_array(f"{1-ts}_{args.output}", data_train, header=header)
        ul.write_array(f"{ts}_{args.output}", data_test, header=header)
    else:
        print("error")
Example #30
    def validate(self):
        # Validate model
        train_x, y = self.load_train()
        train_x, test_x, train_y, test_y = u.split_data(train_x, y)

        self.fit(train_x, train_y)

        print("Validating")
        preds = self.compute_predict(test_x)
        print(metrics.classification_report(test_y, preds))
Example #31
def prepare_quries_answers(args):
  chat_data = utils.load_data(args.data_dir)
  chat_data = utils.filter_sentences(chat_data, args.whitelist)
  index2word, word2index = utils.build_vocab(chat_data, max_words=args.max_words)
  Limits.q_max_len, Limits.a_max_len, Limits.q_min_len, Limits.a_min_len = args.q_max_len, \
      args.a_max_len, args.q_min_len, args.a_min_len
  queries, answers = utils.split_data(chat_data, Limits)
  queries, answers = utils.vectorize(queries,  answers, word2index, sort_by_len=True)

  return queries, answers, index2word, word2index
Example #32
def main():
    # Extract list of stopwords
    with open(conf.STOPWORDS_FILE, 'r') as f:
        stopword_list = f.read()
        sw = set([w.strip() for w in stopword_list.split()])

    # Split data into ham and spam folders
    train_data_path = os.path.abspath(os.path.join(conf.TRAIN_DIR))
    train_file_path = os.path.abspath(os.path.join(conf.TRAIN_FILE))
    utils.split_data(train_file_path, train_data_path)

    # Process training data and prepare sets of (features, label) data
    spam_path = os.path.join(train_data_path, '0')  # label 0 for spam
    ham_path = os.path.join(train_data_path, '1')  # label 1 for ham
    spam_mails = utils.get_dir_data(spam_path)
    ham_mails = utils.get_dir_data(ham_path)
    spam_set, ham_set = process_train_data(spam_mails, ham_mails, stopwords=sw)

    # 5 Fold Cross Validation with training data to report result metrics
    precision, recall, F1, ham_mails_accuracy, spam_mails_accuracy = \
    get_matrix(spam_set, ham_set, conf.NUM_FOLDS)
    print "Precision : %.4f" % precision
    print "Recall : %.4f" % recall
    print "F1 : %.4f" % F1
    print "Spam Mails Accuracy : %.2f" % spam_mails_accuracy
    print "Ham Mails Accuracy : %.2f" % ham_mails_accuracy

    # Model training on 100% train data
    train_set = spam_set + ham_set
    classifier = NaiveBayesClassifier.train(train_set)

    # Top 20 informative features
    classifier.show_most_informative_features(20)

    # Classify on given test data
    test_data_path = os.path.abspath(os.path.join(conf.TEST_DIR))
    output_dir_path = os.path.abspath(os.path.join(conf.OUTPUT_DIR))
    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)
    output_file_path = os.path.join(output_dir_path, conf.OUTPUT_FILE)
    test_mails = utils.get_dir_data_with_filename(test_data_path)
    utils.write_file(output_file_path,
                     classify_data(classifier, test_mails, stopwords=sw))
Example #33
def main():    	
	# Extract list of stopwords
	with open(conf.STOPWORDS_FILE, 'r') as f:
		stopword_list = f.read()
		sw = set([w.strip() for w in stopword_list.split()])
		
	# Split data into ham and spam folders
	train_data_path = os.path.abspath(os.path.join(conf.TRAIN_DIR))
	train_file_path = os.path.abspath(os.path.join(conf.TRAIN_FILE))
	utils.split_data(train_file_path, train_data_path)

	# Process training data and prepare sets of (features, label) data
	spam_path = os.path.join(train_data_path, '0')     # label 0 for spam
	ham_path = os.path.join(train_data_path, '1') 	  # label 1 for ham
	spam_mails = utils.get_dir_data(spam_path)
	ham_mails = utils.get_dir_data(ham_path)	
	spam_set, ham_set = process_train_data(spam_mails, ham_mails, stopwords = sw)

	# 5 Fold Cross Validation with training data to report result metrics
	precision, recall, F1, ham_mails_accuracy, spam_mails_accuracy = \
	get_matrix(spam_set, ham_set, conf.NUM_FOLDS)
	print "Precision : %.4f" % precision 
	print "Recall : %.4f" % recall
	print "F1 : %.4f" % F1
	print "Spam Mails Accuracy : %.2f" % spam_mails_accuracy 
	print "Ham Mails Accuracy : %.2f" % ham_mails_accuracy

	# Model training on 100% train data
	train_set = spam_set + ham_set
	classifier = NaiveBayesClassifier.train(train_set)

	# Top 20 informative features
	classifier.show_most_informative_features(20)

	# Classify on given test data
	test_data_path = os.path.abspath(os.path.join(conf.TEST_DIR))
	output_dir_path = os.path.abspath(os.path.join(conf.OUTPUT_DIR))	
	if not os.path.exists(output_dir_path):
		os.makedirs(output_dir_path)
	output_file_path = os.path.join(output_dir_path, conf.OUTPUT_FILE)
	test_mails = utils.get_dir_data_with_filename(test_data_path)
	utils.write_file(output_file_path, classify_data(classifier, test_mails, stopwords = sw))
Example #34
train_size = train_data[0].shape[0]
test_size = test_data[0].shape[0]
num_feas = len(utils.FIELD_SIZES)

min_round = 1
num_round = 200
early_stop_round = 5
batch_size = 1024

field_sizes = utils.FIELD_SIZES
field_offsets = utils.FIELD_OFFSETS

algo = 'pnn1'

if algo in {'fnn', 'ccpm', 'pnn1', 'pnn2'}:
    train_data = utils.split_data(train_data)
    test_data = utils.split_data(test_data)
    tmp = []
    for x in field_sizes:
        if x > 0:
            tmp.append(x)
    field_sizes = tmp
    print('remove empty fields', field_sizes)

if algo == 'lr':
    lr_params = {
        'input_dim': input_dim,
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'l2_weight': 0,
        'random_seed': 0
Example #35
def init_data(share):
    data_lines = open('wdbc.data').readlines()
    data = [x.split(',') for x in data_lines]
    # numpy.random.shuffle(data)
    return utils.split_data(data, share)
Example #36
# parameters
batch_size = 32
num_epochs = 1000
training_split = .8
do_random_crop = False
num_classes = 2
dataset_name = 'imdb'
input_shape = (48, 48, 3)
images_path = '../datasets/imdb_crop/'
log_file_path = 'log_files/gender_training.log'
trained_models_path = '../trained_models/gender_models/simple_CNN'

# data loader
data_loader = DataLoader(dataset_name)
ground_truth_data = data_loader.get_data()
train_keys, val_keys = split_data(ground_truth_data, training_split)
image_generator = ImageGenerator(ground_truth_data, batch_size,
                                input_shape[:2],
                                train_keys, val_keys, None,
                                path_prefix=images_path,
                                vertical_flip_probability=0,
                                do_random_crop=do_random_crop)

# model parameters/compilation
model = simple_CNN(input_shape, num_classes)
model.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
model.summary()

# model callbacks
    reg_term = sum(T.sum(p ** 2) for p in params)

    def loss(y, t):
        return loss_function(y, t) + lambda_reg * reg_term

    return nn.objectives.Objective(l_out, loss_function=loss)


train_labels = p.read_csv(os.path.join(base_dir, 'data/trainLabels.csv'))
labels_split = p.DataFrame(list(train_labels.image.str.split('_')),
                           columns=['id', 'eye'])
labels_split['level'] = train_labels.level
labels_split['id'] = labels_split['id'].astype('int')

id_train, y_train, id_valid, y_valid = split_data(train_labels, labels_split,
                                                  valid_size=10,
                                                  SEED=SEED, pairs=True)

# Change train dataset to oversample other labels.
# Total sizes:
# (       image
#  level
#  0      25810
#  1       2443
#  2       5292
#  3        873
#  4        708,           image
#  level
#  0      0.734783
#  1      0.069550
#  2      0.150658