Python extract_data Examples, preprocess.extract_data Python Examples

Example #1

0

Show file

def main():
    """Run Bayesian optimization over pre-evaluated model configurations.

    Loads the domains and evaluation results with ``preprocess.extract_data``,
    builds an embedding and a kernel for them (re-embedding through MDS when
    ``args.embedding == "mds"``), runs ``run_bayesian_optimization`` with an
    objective that looks results up in the pre-computed table, and writes the
    outcome with ``write_results``.
    """
    args = get_args()
    logging.info("Extracting domains and evaluation results for all models")
    rescaled_domain_lst, domain_name_lst, eval_lst, BEST, WORST = \
        preprocess.extract_data(args.modeldir, args.architecture,
                                args.rnn_cell_type, args.metric, args.best)
    embedding_lst = get_embedding(args, rescaled_domain_lst, domain_name_lst,
                                  eval_lst)

    kernel = get_kernel.get_kernel(args.architecture, args.embedding,
                                   args.embedding_distance, args.kernel,
                                   domain_name_lst, len(embedding_lst[0]),
                                   args.weight)
    if args.embedding == "mds":
        # Turn the kernel values into exp(-value), embed them with MDS, then
        # switch to a "logsquared" kernel over the new "origin" embedding.
        D = kernel.get_value(embedding_lst)
        D = np.exp(-np.array(D))
        embedding_lst, S = get_mds_embedding.mds(D)
        kernel = get_kernel.get_kernel(args.architecture, "origin",
                                       args.embedding_distance, "logsquared",
                                       domain_name_lst, len(embedding_lst[0]),
                                       args.weight)

    # The extra argument needs a %s placeholder; without one the logging
    # module fails to format the record and the kernel name is never logged.
    logging.info("kernel: %s", args.kernel)
    logging.info(kernel.get_value(np.atleast_2d(rescaled_domain_lst)))

    def objective_function(x):
        '''
        Map a sampled domain to evaluation results returned from the model
        :param x: domain sampled from bayesian optimization
        :return: the corresponding evaluation result, or None when x does not
                 exactly match any known embedding
        '''
        for i, embedding in enumerate(embedding_lst):
            if (x == embedding).all():
                return eval_lst[i]
        return None

    results, best_ind, fix_budget, close_best = run_bayesian_optimization(
        args, kernel, objective_function, embedding_lst, eval_lst, BEST, WORST)
    write_results(args, results, best_ind, fix_budget, close_best)

Example #2

0

Show file

def main():
    """Run ``args.num_run`` graph-based searches for the best configuration.

    Each run starts from three consecutive labelled points, repeatedly calls
    ``graph_obj.update()`` until ``BEST`` appears among the graph's labels,
    and then records three statistics before calling ``write_results``:

    * ``best_ind``   - number of labels beyond the three initial ones
    * ``fix_budget`` - gap between ``BEST`` and the best label within the
                       first ``args.budget`` labels
    * ``close_best`` - 1-based position of the first label within
                       ``args.dif`` of ``BEST`` (if there is one)
    """
    args = get_args()

    # 0. Prepare data
    logger.info("Extracting data ...")
    rescaled_domain_lst, domain_name_lst, eval_lst, BEST, WORST = \
        preprocess.extract_data(args.modeldir, args.architecture,
                                args.rnn_cell_type, args.metric, args.best)
    logger.info("Best point: {0}".format(BEST))
    X = np.array(rescaled_domain_lst)
    Y = np.array(eval_lst)

    best_ind = []
    fix_budget = []
    close_best = []

    # Every run is seeded with this many consecutive labelled points.
    num_label = 3

    for nr in range(args.num_run):
        # 1. setup logger
        logdir = args.output + "_log"
        if not os.path.isdir(logdir):
            os.mkdir(logdir)
        file_handler = logging.FileHandler(
            os.path.join(logdir, str(nr) + ".log"))
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

        try:
            # 2. Initialization
            # Wrap the start index so the window of initial labels always
            # fits inside X, however many runs are requested.
            start = nr % (X.shape[0] - 2)
            ind_label = np.arange(start, start + num_label)

            # 3. Find the best label
            logger.info("Building the graph ...")
            graph_obj = graph.Graph(args.model, X, Y, args.distance,
                                    args.sparsity, logger, domain_name_lst,
                                    args.distance_param, args.k, ind_label,
                                    num_label)
            num_update = 1
            while True:
                logger.info("###############################")
                logger.info("Update # {0}: ".format(num_update))
                graph_obj.update()
                if BEST in graph_obj.y_label:
                    logger.info(
                        "Found the best configuration at update # {0}".format(
                            num_update))
                    break
                num_update += 1

            # 4. Write stats
            best_ind.append(graph_obj.y_label.shape[0] - num_label)
            fix_budget.append(
                abs(max(graph_obj.y_label[:args.budget]) - BEST))
            for i, label in enumerate(graph_obj.y_label):
                if abs(label - BEST) <= args.dif:
                    close_best.append(i + 1)
                    break
            write_results(args, best_ind, fix_budget, close_best, graph_obj)
        finally:
            # Detach this run's handler; otherwise every later run would also
            # be written into this run's log file and the handle would leak.
            logger.removeHandler(file_handler)
            file_handler.close()

Example #3

0

Show file

    parser.add_argument('--kernel', default="expsquared", type=str, choices=["constant", "polynomial", "linear",
                        "dotproduct", "exp", "expsquared", "matern32", "matern52",
                        "rationalquadratic", "expsine2", "heuristic"],
                        help='Specify the kernel for Gaussian process.')

    # Other arguments
    parser.add_argument("--threshold", type=int, default=5, help="Remove samples with bad performance(BLEU).")
    parser.add_argument("--output", help="Output directory.")

    return parser.parse_args()

if __name__ == '__main__':
    args = get_args()
    modeldir = "/export/a10/kduh/p/mt/gridsearch/" + args.dataset + "/models/"

    x, y, _ = extract_data(modeldir=modeldir, threshold=args.threshold,
                           architecture=args.architecture, rnn_cell_type=args.rnn_cell_type)
    y = -y

    lower = np.zeros((x.shape[1]))
    upper = np.ones((x.shape[1]))
    if args.architecture == "trans":
        domain_name_lst = ['num_layers', 'transformer_attention_heads', 'transformer_feed_forward_num_hidden',
        'transformer_model_size', 'bpe_symbols', 'initial_learning_rate', 'num_embed']
    else:
        domain_name_lst = []
    kernel = get_kernel.get_kernel(args.architecture, args.kernel, domain_name_lst, x.shape[1])

    def objective_function(sample):
        '''
        Map a sampled domain to evaluation results returned from the model
        :param x_sample: domain sampled from bayesian optimization

Example #4

0

Show file

import numpy as np
import preprocess as pp
from keras.models import model_from_json

# Load the test question pairs and turn them into padded model inputs.
question1, question2 = pp.extract_data("quora-question-pairs/test.csv", 'test')
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(
    word_index, embeddings_index)
q1_data, q2_data, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'test')

# NOTE: despite the "_train" names these arrays hold the test split loaded
# above; the names are kept so any later code that refers to them still works.
X_train = np.stack((q1_data, q2_data), axis=1)
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

# Restore the saved architecture and weights. The context manager closes the
# JSON file even if reading it fails.
with open('best_weights/model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights("best_weights/weights.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
score = model.predict([Q1_train, Q2_train])
print(score)

Example #5

0

Show file

File: mean_squared_error.py Project: Este1le/Auto-tuning

import sys
sys.path.insert(1,
                '/export/a08/xzhan138/Auto-tuning/multi-objective/regressor')
sys.path.insert(1, '/export/a08/xzhan138/Auto-tuning/multi-objective')
import pickle
import numpy as np
import random
from gp import GP
from krr import KRR
from gbssl import GBSSL
from preprocess import extract_data

if __name__ == "__main__":
    modeldir = "/export/a10/kduh/p/mt/gridsearch/robust19-ja-en/models/"
    x, y, _ = extract_data(modeldir, 5)
    output = "/export/a08/xzhan138/Auto-tuning/diagnosis_output/mse{0}.pkl"
    models = [KRR, GP, GBSSL]
    for i in random.sample(range(len(y) - 3), 5):
        print("init: {0}".format(i))
        mse_dic = {}
        for m in range(len(models)):
            mse = []
            label_ids = np.array([i, i + 1, i + 2])
            model = models[m]
            print("model: {0}".format(model.__name__))
            while len(label_ids) != len(y):
                opt_model = model(x, y[label_ids], label_ids)
                y_preds, y_vars = opt_model.fit_predict()
                del opt_model

                unlabel_ids = np.array(

Example #6

0

Show file

    return best - current_best

if __name__ == "__main__":
    # For every architecture/dataset directory, score each pickled result
    # file with all evaluation functions and store the (mean, std) pairs in
    # <dataset_dir>/eval.pkl, keyed by the result file name without ".pkl".
    architectures = ["rnn", "trans"]
    evaluations = [s2best, s2close_best, ns2dif_best]
    input_dir = '/export/a08/xzhan138/Auto-tuning/single_output/'
    output_name = "eval.pkl"

    for architecture in architectures:
        arch_dir = input_dir + architecture
        if not os.path.isdir(arch_dir):
            continue
        for dataset_name in os.listdir(arch_dir):
            dataset_dir = arch_dir + "/" + dataset_name
            if not os.path.isdir(dataset_dir):
                continue
            evals = defaultdict(list)
            modeldir = ("/export/a10/kduh/p/mt/gridsearch/" + dataset_name
                        + "/models/")
            for pf in os.listdir(dataset_dir):
                # Only pickled result files are scored. The summary written
                # by a previous run of this script is not one of them.
                if not pf.endswith("pkl") or pf == output_name:
                    continue

                if "gru" in pf:
                    x, y, _ = extract_data(modeldir, 5, architecture, "gru")
                else:
                    x, y, _ = extract_data(modeldir, 5, architecture)
                best_id = np.argmax(y)

                # os.listdir yields bare names, so the file has to be opened
                # relative to dataset_dir, not the working directory.
                # NOTE: pickle.load can execute arbitrary code; only run this
                # on result files produced by a trusted source.
                with open(os.path.join(dataset_dir, pf), 'rb') as f:
                    I = pickle.load(f)
                for evaluation in evaluations:
                    I_eval = np.apply_along_axis(evaluation, 1, I,
                                                 best_id=best_id, y=y)
                    evals[pf[:-4]].append(
                        (np.average(I_eval), np.std(I_eval)))
            with open(dataset_dir + "/" + output_name, 'wb') as f:
                # pickle.dump needs the target file as its second argument.
                pickle.dump(evals, f)

Example #7

0

Show file

import classifier

from preprocess import extract_data

# One fixed value is passed both to the classifier and to extract_data
# (presumably the sequence length -- confirm against preprocess/classifier).
timesteps = 140
clf = classifier.Classifier(timesteps)

# TSV splits, given relative to the working directory.
train_file, test_file = '../../tsv/train.tsv', '../../tsv/test.tsv'

# Load both splits up front, training data first.
X_train, y_train = extract_data(train_file, timesteps)
X_test, y_test = extract_data(test_file, timesteps)

# Fit on the training split, then evaluate on the test split.
clf.fit_classif(X_train, y_train)
clf.evaluate(X_test, y_test)

Example #8

0

Show file

def _score_models(out, models, names, feats, labels):
    """Cross-validate *models* on *feats*, print the scores and log them.

    One ``name:score`` line is written to *out* for every returned score,
    pairing score ``i`` with ``names[i]``.
    """
    scores = cross_valid(models, feats, labels)
    print(scores)
    for i, score in enumerate(scores):
        out.write(names[i] + ':' + str(score) + '\n')


def main():
    """Run the experiment stages that are enabled in ``config``.

    Stages, each guarded by a ``config`` flag that must be exactly ``True``:

    * ``select_model``    - cross-validate three model families on the raw
                            image and on the combined feature vector
    * ``select_feature``  - cross-validate ``config.models`` on each single
                            feature type
    * ``produce_results`` - fit/predict with the configured features
                            (optionally visualising the errors)
    * ``draw_ROC``        - plot the ROC curve for ``config.lda``

    All textual results go to ``config.output_file``, which is closed when
    the function returns or raises.
    """
    # load and preprocess data
    train_labels, train_imgs = extract_data(config.train_path)
    test_labels, test_imgs = extract_data(config.test_path)

    with open(config.output_file, 'w') as f:
        # model selection
        if config.select_model is True:
            print("Selecting model...")
            f.write("Scores for model selection:\n")
            # original image
            plain = True  # set parameters for feature extraction
            pool = {'take': False, 'class': 'max'}
            hist = {'take': False, 'h': [4], 'w': [4]}
            grad = {'take': False, 'class': 'hist'}
            chain = {'take': False, 'class': 'hist'}
            select_feats1 = get_feats(train_imgs, plain, pool, hist, grad,
                                      chain)

            # feature vector
            pool = {'take': False, 'class': 'max'}
            hist = {'take': True, 'h': [4], 'w': [4]}
            grad = {'take': True, 'class': 'hist'}
            chain = {'take': True, 'class': 'hist'}
            select_feats2 = get_feats(train_imgs, plain, pool, hist, grad,
                                      chain)

            # get cross-validation scores
            f.write("Baseline (original image):\n")
            print("logistic regression models:")
            _score_models(f, config.models_select1, config.names1,
                          select_feats1, train_labels)
            print("multi-class logistic regression models:")
            _score_models(f, config.models_select2, config.names2,
                          select_feats1, train_labels)
            print("k-nearest neighbour models:")
            _score_models(f, config.models_select3, config.names3,
                          select_feats1, train_labels)

            f.write("\nFeature vector:\n")
            _score_models(f, config.models_select1, config.names1,
                          select_feats2, train_labels)
            print("multi-class logistic regression models:")
            _score_models(f, config.models_select2, config.names2,
                          select_feats2, train_labels)
            print("k-nearest neighbour models:")
            _score_models(f, config.models_select3, config.names3,
                          select_feats2, train_labels)

            f.write("\n######################\n\n")

        # feature selection
        if config.select_feature is True:
            print("Selecting features...")
            f.write("Scores for feature selection:\n")
            plain = True  # set parameters for feature extraction
            pool = {'take': False, 'class': 'max'}
            hist = {'take': False, 'h': [4], 'w': [4]}
            grad = {'take': False, 'class': 'hist'}
            chain = {'take': False, 'class': 'hist'}

            # histogram
            hist['take'] = True
            print("Extract histogram from training data set...")
            f.write("\nHistogram:\n")
            select_feats = get_feats(train_imgs, plain, pool, hist, grad,
                                     chain)
            _score_models(f, config.models, config.names, select_feats,
                          train_labels)

            # gradient histogram
            hist['take'] = False
            grad['take'] = True
            print("Extract gradient histogram from training data set...")
            f.write("\nGradient histogram:\n")
            select_feats = get_feats(train_imgs, plain, pool, hist, grad,
                                     chain)
            _score_models(f, config.models, config.names, select_feats,
                          train_labels)

            # gradient image
            grad['class'] = 'plain'
            print("Extract gradient image from training data set...")
            f.write("\nGradient image:\n")
            select_feats = get_feats(train_imgs, plain, pool, hist, grad,
                                     chain)
            _score_models(f, config.models, config.names, select_feats,
                          train_labels)

            # chain code histogram
            grad['take'] = False
            chain['take'] = True
            print("Extract chain code histogram from training data set...")
            f.write("\nChain code histogram:\n")
            select_feats = get_feats(train_imgs, plain, pool, hist, grad,
                                     chain)
            _score_models(f, config.models, config.names, select_feats,
                          train_labels)

            f.write("\n######################\n\n")

        if config.produce_results is True or config.draw_ROC is True:
            # feature extraction
            print("Extract feature from training data set...")
            train_feats = get_feats(train_imgs, config.plain, config.pool,
                                    config.hist, config.grad, config.chain)
            print("Extract feature from testing data set...")
            test_feats = get_feats(test_imgs, config.plain, config.pool,
                                   config.hist, config.grad, config.chain)
            print("All data processed. Number of features extracted is " +
                  str(len(train_feats[0])))

            if config.produce_results is True:
                print("Producing prediction results...")
                f.write("Prediction results\n")
                f.write('original image: ' + str(config.plain))
                f.write('\n')
                f.write('pooled:' + str(config.pool))
                f.write('\n')
                f.write('histogram:' + str(config.hist))
                f.write('\n')
                f.write('gradient:' + str(config.grad))
                f.write('\n')
                f.write('chain code:' + str(config.chain))
                f.write('\n\n')
                all_preds = final_result(config.models, config.names,
                                         train_feats, train_labels,
                                         test_feats, test_labels, f)

                if config.visualize_error is True:
                    # Errors are taken from the third entry of all_preds.
                    preds = all_preds[2]
                    err_imgs = test_imgs[preds != test_labels]
                    err_labels = test_labels[preds != test_labels]
                    visualize(err_labels, err_imgs)

            if config.draw_ROC is True:
                print("Drawing ROC for LDA model...")
                plot_ROC(config.lda, train_feats, train_labels,
                         test_feats, test_labels)

Example #9

0

Show file

import numpy as np
import preprocess as pp
from keras.models import *
from keras_model import model

# Load the training question pairs together with their duplicate labels.
question1, question2, is_duplicate = pp.extract_data(
    "quora-question-pairs/train.csv", 'train')
# Tokenize both question columns; word_index is reused below to build the
# embedding matrix.
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(
    word_index, embeddings_index)
# In 'train' mode process_data also receives the labels and returns them.
q1_data, q2_data, labels, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'train', is_duplicate)

# Stack the two question arrays along a new axis 1, then split them again
# into the two inputs passed to fit() below.
X_train = np.stack((q1_data, q2_data), axis=1)
y_train = labels
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

# NOTE: this rebinds the name `model`, shadowing the factory imported from
# keras_model; the factory cannot be called again after this line.
model = model(nb_words, word_embedding_matrix)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit([Q1_train, Q2_train], y_train, batch_size=32, epochs=100)

model_json = model.to_json()
with open("best_weights/model.json", "w") as json_file: