Example No. 1
def main(args):
    data_train, gold_train = get_data(args.train_path)
    data_dev, gold_dev = get_data(args.dev_path)

    word_to_int, int_to_word = get_vocab(data_train, args.min_word_count)

    vocab_size = len(word_to_int)
    max_len = 100

    train_loader = DataLoader(data_train, gold_train, args.batch_size,
                              word_to_int, args.transformer)
    dev_loader = TestLoader(data_dev, gold_dev, word_to_int, args.transformer)

    lossFunction = nn.CrossEntropyLoss()
    if args.transformer:
        model = Model_T(args.embed_size, args.hidden_size, args.inter_size,
                        vocab_size, max_len, args.n_heads, args.n_layers,
                        args.per_layer, args.dropout_prob_classifier,
                        args.dropout_prob_attn, args.dropout_prob_hidden,
                        args.use_elmo, args.num_rep, args.elmo_drop).cuda()
    elif args.BiLSTM:
        model = Model_B(args.embed_size, args.hidden_size, vocab_size,
                        args.use_elmo, args.num_rep, args.elmo_drop).cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    train(model, optimizer, lossFunction, train_loader, dev_loader,
          args.epochs, args.eval_every)
Example No. 2
	def __init__(self, networkType, numIterations, populationSize, selection, crossover, crossoverProb, mutationProb, nnParams):
		self.networkType = networkType
		self.numberOfIterations = numIterations
		self.populationSize = populationSize
		self.selection = selection
		self.crossover = crossover
		self.crossoverProb = crossoverProb
		self.mutationProb = mutationProb
		self.nnParams = nnParams

		#User specified neural network
		if (self.networkType == "nn"):
			sys.stdout.flush()

			# Load data from the cifar10 dataset, with 10 output neurons
			self.train_train_data = dl.get_data("nn",'cifar10',10)

		#User specified convolutional neural network
		else:
			sys.stdout.flush()

			# Load data from the cifar10 dataset
			self.x_y_train_data = dl.get_data("cnn",'cifar10')

		self.population = self.initializePopulation()
Example No. 3
def print_accuracy(config, model):
    classifier = KNeighborsClassifier(n_neighbors=1)
    X, y = get_data(config)
    X, y = to_var(X), to_var(y)
    if model is not None:
        X = model.g(X)
    X, y = to_data(X), to_data(y)
    classifier.fit(X.reshape(X.shape[0], -1), y.reshape(-1))

    Xhat, yhat = get_data(config, train=False)
    Xhat, yhat = to_var(Xhat), to_var(yhat)
    Xhat, yhat = to_data(Xhat), to_data(yhat)
    pred = classifier.predict(Xhat.reshape(Xhat.shape[0], -1))
    accuracy = (pred == yhat.reshape(-1)).astype(float).mean()
    print('Classification accuracy: %0.4f' % (accuracy))
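Note: to_var and to_data are helpers this example does not show. A minimal sketch, assuming they only move data between NumPy arrays and torch tensors (the names and behaviour here are assumptions, not the project's actual code):

import torch

def to_var(x):
    # Assumed helper: wrap an array as a torch tensor on the active device.
    t = torch.as_tensor(x)
    return t.cuda() if torch.cuda.is_available() else t

def to_data(x):
    # Assumed helper: detach from the autograd graph and return a CPU NumPy array.
    return x.detach().cpu().numpy()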
Example No. 4
def main(args):
    SEED = 17
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    random.seed(SEED)
    torch.cuda.manual_seed(SEED)

    net = nm.Net()
    path = './trained_model_best.pth'
    d = torch.load(path)
    net.load_state_dict(d['state_dict'])

    net.eval()
    net.cuda()

    train_loader, test_loader = dl.get_data(args.data)
    val_loss = 0.
    criterion = torch.nn.MSELoss()
    for batch_idx, batch in enumerate(test_loader):
        inputs = torch.autograd.Variable(batch).cuda()
        with torch.no_grad():
            output = net(inputs)
        loss = criterion(output, inputs)
        val_loss += loss.cpu().data.numpy() * len(inputs)
    val_loss = val_loss / len(test_loader.dataset)
    print(val_loss)
Example No. 5
def solve_sequential():
	X, y, n, dim = data_loader.get_data()
	X_trans = X.transpose()
	iteration_cnt = 10000

	opt_loss = optimal_solve(X, y, n)

	bt_beta = 0.5
	bt_alpha = 0.1

	current_beta = np.zeros(dim + 1)

	for i in range(iteration_cnt):
		gradient = X.dot(current_beta) - y
		gradient = X_trans.dot(gradient)

		gradient_norm = np.linalg.norm(gradient, ord=2)
		gradient_norm_sq = gradient_norm ** 2

		step_size = 1
		# backtracking line search
		while target_func(X, y, current_beta - step_size * gradient) > \
		      target_func(X, y, current_beta) - bt_alpha * step_size * gradient_norm_sq:
			step_size *= bt_beta
		
		current_beta = current_beta - step_size * gradient

		diff = target_func(X, y, current_beta) - opt_loss
		diff /= n
		print(math.log(diff))
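Note: target_func and optimal_solve are not shown in this example. A minimal sketch consistent with the gradient X_trans.dot(X.dot(beta) - y) used in the loop, i.e. a plain least-squares objective (an assumption; the original helpers may differ):

import numpy as np

def target_func(X, y, beta):
    # 0.5 * ||X beta - y||^2, whose gradient is X^T (X beta - y).
    residual = X.dot(beta) - y
    return 0.5 * residual.dot(residual)

def optimal_solve(X, y, n=None):
    # Reference optimum via the least-squares closed form.
    beta_star, *_ = np.linalg.lstsq(X, y, rcond=None)
    return target_func(X, y, beta_star)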
Example No. 6
def train_network():
    print('Loading data')
    (data, labels) = data_loader.get_data()

    print('Dividing data')
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=20)

    print(x_train[0].shape)

    # This part could be changed
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(256, 256)),
        tf.keras.layers.Dense(120, activation=tf.nn.relu),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(x_train, y_train, batch_size=128, epochs=5)
    a = model.evaluate(x_test, y_test)
    print(a)
    save_network(model)
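Note: save_network is not defined in this example. A plausible one-liner using the standard Keras save API (the file name is an assumption):

def save_network(model, path='network.h5'):
    # Persist the trained model; reload later with tf.keras.models.load_model(path).
    model.save(path)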
Example No. 7
def solve_sequential():
    X, y, n, dim = data_loader.get_data()
    lasso_lam = 1

    opt_val = float('inf')

    X_trans = X.transpose()
    iteration_cnt = 10000

    current_beta = np.zeros(dim + 1)

    for i in range(iteration_cnt):

        rate = 1
        shrink_beta = 0.5

        while True:
            op = X_trans.dot(y - X.dot(current_beta))
            gradient = -op
            op = rate * op
            op = current_beta + op
            try_beta = soft_thres_op(lasso_lam * rate, op)

            gtbeta = (current_beta - try_beta) / rate

            if func_g(X, y, try_beta) > func_g(
                    X, y, current_beta) - rate * gtbeta.dot(gradient) + (
                        (rate / 2) * (np.linalg.norm(gtbeta, ord=2))**2):
                rate *= shrink_beta
            else:
                current_beta = try_beta
                break

        opt_val = min(opt_val, target_func(X, y, lasso_lam, current_beta))
        print(opt_val)
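Note: func_g, soft_thres_op and target_func are assumed helpers here. A minimal sketch matching the proximal-gradient (ISTA) structure of the loop above, with the usual soft-thresholding operator for the l1 penalty (the original implementations may differ):

import numpy as np

def func_g(X, y, beta):
    # Smooth part of the lasso objective: 0.5 * ||X beta - y||^2.
    residual = X.dot(beta) - y
    return 0.5 * residual.dot(residual)

def soft_thres_op(threshold, v):
    # Proximal operator of threshold * ||.||_1 (coordinate-wise soft-thresholding).
    return np.sign(v) * np.maximum(np.abs(v) - threshold, 0.0)

def target_func(X, y, lam, beta):
    # Full lasso objective: smooth part plus the l1 penalty.
    return func_g(X, y, beta) + lam * np.linalg.norm(beta, ord=1)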
Example No. 8
def train(dataset: str, learning_rate: float, epochs: int, device: str,
          model: str, beta: float):
    def kl_divergence(means, log_sigma, target_sigma=1.):
        kl_div = target_sigma**-2 * means**2 + 2 * np.log(target_sigma)
        kl_div += -2 * log_sigma + torch.exp(2 * log_sigma) / target_sigma**2
        return torch.mean(torch.sum(kl_div, dim=1))

    device = torch.device(device if torch.cuda.is_available() else 'cpu')

    train_loader, valid_loader = get_data(dataset)

    net_setup = {
        'oscillator': {
            'input_size': 50,
            'input2_size': 1,
            'latent_size': 2,
            'output_size': 1,
            'encoder_units': [100, 100],
            'decoder_units': [100, 100]
        },
        'collision': {
            'input_size': 30,
            'input2_size': 16,
            'latent_size': 1,
            'output_size': 2,
            'encoder_units': [150, 100],
            'decoder_units': [100, 150]
        }
    }[dataset]

    network = SciNet(**net_setup)
    network = network.to(device)

    if model:
        network.load_state_dict(torch.load(model))
        print("Restored weights")

    mse_loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)

    for epoch in tqdm(range(epochs)):
        for i, (x, t, y) in enumerate(train_loader):
            x, t, y = x.to(device), t.to(device), y.to(device)

            optimizer.zero_grad()

            mu, log_sigma, output = network(x, t)

            loss = mse_loss(output, y) + beta * kl_divergence(mu, log_sigma)
            loss.backward()

            torch.nn.utils.clip_grad.clip_grad_value_(network.parameters(),
                                                      10.0)

            optimizer.step()

        print(loss.item())

    torch.save(network.state_dict(), f'{dataset}.pth')
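Note: the hand-written kl_divergence above appears to equal twice the analytic KL(N(mu, sigma^2) || N(0, target_sigma^2)) plus a constant (the latent dimension), so it behaves like the usual VAE regularizer up to a rescaling of beta. A hedged cross-check using torch.distributions, not part of the original code:

import torch
from torch.distributions import Normal, kl_divergence as torch_kl

def analytic_kl(means, log_sigma, target_sigma=1.0):
    # Exact KL between the diagonal Gaussian posterior and the N(0, target_sigma) prior.
    q = Normal(means, torch.exp(log_sigma))
    p = Normal(torch.zeros_like(means), target_sigma * torch.ones_like(means))
    return torch_kl(q, p).sum(dim=1).mean()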
Example No. 9
def run(args):
    logger.info('Read data:')
    train_A, train_B, test_A, test_B = get_data(args.task, args.image_size)

    logger.info('Build graph:')
    model = CycleGAN(args)

    variables_to_save = tf.global_variables()
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)

    logger.info('Trainable vars:')
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    if args.load_model != '':
        model_name = args.load_model
    else:
        model_name = '{}_{}'.format(
            args.task,
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    logdir = '/content/drive/My Drive/Colab Notebooks/Cycle/logs'
    makedirs(logdir)
    logdir = os.path.join(logdir, model_name)
    logger.info('Events directory: %s', logdir)
    summary_writer = tf.summary.FileWriter(logdir)

    def init_fn(sess):
        logger.info('Initializing all parameters.')
        sess.run(init_all_op)

    sv = tf.train.Supervisor(
        is_chief=True,
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=model.global_step,
        save_model_secs=300,
        save_summaries_secs=30)

    if args.train:
        logger.info("Starting training session.")
        with sv.managed_session() as sess:
            model.train(sess, summary_writer, train_A, train_B)

    logger.info("Starting testing session.")
    with sv.managed_session() as sess:
        base_dir = os.path.join(
            '/content/drive/My Drive/Colab Notebooks/Cycle/results',
            model_name)
        makedirs(base_dir)
        model.test(sess, test_A, test_B, base_dir)
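Note: FastSaver and makedirs are not shown. A sketch of the pattern they commonly follow in TF1 training scripts (a Saver that skips re-writing the meta graph on periodic checkpoints, and an exist-ok directory helper); the project's own versions may differ:

import os
import tensorflow as tf

class FastSaver(tf.train.Saver):
    def save(self, sess, save_path, global_step=None, latest_filename=None,
             meta_graph_suffix='meta', write_meta_graph=True, write_state=True):
        # Always skip the meta graph to keep frequent checkpoints small.
        super(FastSaver, self).save(sess, save_path, global_step, latest_filename,
                                    meta_graph_suffix, write_meta_graph=False,
                                    write_state=write_state)

def makedirs(path):
    # Create the directory if it does not already exist.
    os.makedirs(path, exist_ok=True)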
Example No. 10
def get_data_linear():
    data, labels, columns_dict, values_dict = get_data()

    # preprocess the diseases
    disease_dict = {}
    key_to_disease_string = {}
    for k in values_dict[DISEASE_LABEL]:
        diseases = map(str.lower, map(str.strip, k.split(';')))
        key_to_disease_string[values_dict[DISEASE_LABEL][k]] = k
        for d in diseases:
            if d not in disease_dict:
                disease_dict[d] = len(disease_dict)
    print("DISEASES:", len(disease_dict))
    # end disease preprocess

    # DANGEROUS: RAN ONCE TO FIGURE OUT NUM OF ROWS.
    NUM_ROWS = 5528
    global NUM_COLS
    global NUM_COLS_INITIALIZED
    if not NUM_COLS_INITIALIZED:
        for k in columns_dict:
            if k in FLOAT_LABELS:
                NUM_COLS += 1
            elif k not in IGNORE_LABELS:
                NUM_COLS += len(values_dict[k].items())
        NUM_COLS_INITIALIZED = True

        if INCLUDE_DISEASES:
            NUM_COLS += len(disease_dict)
    # do not change the above unless amount of data changes.

    index_labels = {}
    for k in columns_dict:
        index_labels[columns_dict[k]] = k

    linearized_data = np.zeros((NUM_ROWS, NUM_COLS))
    print("SHAPE OF DATA:", linearized_data.shape)
    for i, d in enumerate(data):
        write_index = 0
        for j, val in enumerate(d):
            if index_labels[j] in FLOAT_LABELS:
                linearized_data[i, write_index] = val
                write_index += 1
            elif INCLUDE_DISEASES and index_labels[j] == DISEASE_LABEL:
                diseases = map(
                    str.lower,
                    map(str.strip, key_to_disease_string[val].split(';')))
                for dis in diseases:
                    linearized_data[i, write_index + disease_dict[dis]] = 1
                write_index += len(disease_dict)
            elif index_labels[j] in IGNORE_LABELS:
                continue
            else:
                assert val == int(val), 'Value must be a value index'
                linearized_data[i, write_index + int(val)] = 1
                write_index += len(values_dict[index_labels[j]].items())
        assert write_index == NUM_COLS
    return linearized_data, labels
Example No. 11
def train(args):
    print("Getting Dataset...")
    source, target = data_loader.get_data(
        args.data_dir)  # TODO make target data
    print("Initializing the Model...")
    model = EncDecModel(args)

    print("Starting training...")
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        for i in range(args.epoches):
            batch_x = source[i + 0:i + args.batch_size * args.time_steps *
                             args.input_size]
            charlist = source[i + 1:i + args.batch_size + 1]
            batch_y = np.zeros((len(charlist), args.output_classes))
            for j in range(len(charlist) - 1):
                batch_y[j][int(charlist[j])] = 1.0
            # Reshape batch to input size
            batch_x = batch_x.reshape(
                (args.batch_size, args.time_steps, args.input_size))

            feed = {model.X: batch_x, model.Y: batch_y}
            sess.run(model.optimizer, feed_dict=feed)
            if i % args.display_step == 0:
                # Calculate Accuracy
                summary, acc = sess.run([model.acc_summary, model.accuracy],
                                        feed_dict=feed)
                print "Step: " + str(i) + ", Training Accuracy: " + str(acc)
            if i % 100 == 0 and not (i == 0):
                seq = ''
                x_inp = batch_x
                for j in range(140):
                    index = model.hypothesis_index.eval({
                        model.X: x_inp,
                        model.Y: batch_y
                    })
                    next_letter = unichr(index[0])
                    x_inp = source[i + 0 + 1 + j:i + args.batch_size *
                                   args.time_steps * args.input_size + 1 + j]
                    x_inp[-1] = float(ord(next_letter))
                    x_inp = x_inp.reshape(
                        (args.batch_size, args.time_steps, args.input_size))
                    seq += next_letter
                with open('save/gen' + str(i) + '.txt', 'w+') as f:
                    print "save:\n" + seq
                    f.write(seq)
                    f.close()
            if i % 1000 == 0 and not (i == 0):
                saver.save(sess, "save/" + str(i) + ".ckpt")

        print "Training is COMPLETE!"
        x_test = source[i:i + args.batch_size]
        y_test = source[i * args.batch_size + 1:i * 2 * args.batch_size]
        test_accuracy = sess.run(model.accuracy, feed_dict=feed)
        print("Final test accuracy: %g" % (test_accuracy))
        saver.save(sess, "save/model.ckpt")
Example No. 12
def get_model(model, ckpt_name, option):
    model_path = ospj('train_log', option.log_dir, ckpt_name)
    ds = get_data('val', option)
    pred_config = PredictConfig(
        model=model,
        session_init=get_model_loader(model_path),
        input_names=['input', 'label', 'bbox'],
        output_names=['wrong-top1', 'top5', 'actmap', 'grad'],
        return_input=True)

    return SimpleDatasetPredictor(pred_config, ds)
Example No. 13
def main(args):
    SEED = 17
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    random.seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    #train_loader, test_loader = dl.get_data(args.data)
    train_loader, test_loader = dl.get_data("")
    trainer.train_network(train_loader, test_loader)
Example No. 14
def train(args):
    print("Getting Dataset...")
    source, target = data_loader.get_data(args.data_dir) # TODO make target data
    print("Initializing the Model...")
    model = EncDecModel(args)

    print("Starting training...")
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        for i in range(args.epoches):
            batch_x = source[i+0 : i+args.batch_size * args.time_steps * args.input_size]
            charlist = source[i+1 : i+args.batch_size+1]
            batch_y = np.zeros((len(charlist), args.output_classes))
            for j in range(len(charlist)-1):
                batch_y[j][int(charlist[j])] = 1.0
            # Reshape batch to input size
            batch_x = batch_x.reshape((args.batch_size, args.time_steps, args.input_size))

            feed = {model.X: batch_x, model.Y: batch_y}
            sess.run(model.optimizer, feed_dict=feed)
            if i % args.display_step == 0:
                # Calculate Accuracy
                summary, acc = sess.run([model.acc_summary, model.accuracy], feed_dict=feed)
                print "Step: " + str(i) + ", Training Accuracy: " + str(acc)
            if i % 100 == 0 and not(i==0):
                seq = ''
                x_inp = batch_x
                for j in range(140):
                    index = model.hypothesis_index.eval({
                        model.X: x_inp,
                        model.Y: batch_y
                    })
                    next_letter = unichr(index[0])
                    x_inp = source[i+0+1+j:i+args.batch_size*args.time_steps*args.input_size+1+j]
                    x_inp[-1] = float(ord(next_letter))
                    x_inp = x_inp.reshape((args.batch_size,args.time_steps,args.input_size))
                    seq += next_letter
                with open('save/gen' + str(i) + '.txt', 'w+') as f:
                    print "save:\n" +seq
                    f.write(seq)
                    f.close()
            if i % 1000 == 0 and not(i == 0):
                saver.save(sess,"save/" +str(i) +".ckpt")


        print "Training is COMPLETE!"
        x_test = source[i:i+args.batch_size]
        y_test = source[i*args.batch_size+1:i*2*args.batch_size]
        test_accuracy = sess.run(model.accuracy, feed_dict=feed)
        print ("Final test accuracy: %g" %(test_accuracy))
        saver.save(sess,"save/model.ckpt")
Example No. 15
def run_model():
    data, true_labels = ldl.get_data_linear()
    true_buckets = [util.bucket(t) for t in true_labels]

    data = np.tile(data, (DATA_MULTIPLIER, 1))
    print("DATA SHAPE:", data.shape)
    true_buckets = np.tile(true_buckets, DATA_MULTIPLIER)

    # tuples of (batch_id, total regret, error while training, eval error, precision, recall)
    batch_results = []

    for T in range(NUM_BATCHES):
        model = Lin_UCB(ALPHA)
        #model = LASSO_BANDIT()
        if False:
            data, true_labels, columns_dict, values_dict = dl.get_data()
            true_buckets = [util.bucket(t) for t in true_labels]
        #model = Fixed_Dose(columns_dict, values_dict)
        #model = Warfarin_Clinical_Dose(columns_dict, values_dict)
        #model = Warfarin_Pharmacogenetic_Dose(columns_dict, values_dict)

        batch_id = str(random.randint(100000, 999999))
        print()
        print("Start Batch: ", batch_id)

        zipped_data = list(zip(data, true_buckets))
        random.shuffle(zipped_data)
        data, true_buckets = zip(*zipped_data)
        data = np.array(data)

        model.train(data, true_buckets)
        pred_buckets = model.evaluate(data)
        print(batch_id, "Performance on " + str(model))
        acc, precision, recall = util.evaluate_performance(
            pred_buckets, true_buckets)
        print("\tAccuracy:", acc)
        print("\tPrecision:", precision)
        print("\tRecall:", recall)

        plot_regret(model.regret, ALPHA, batch_id)
        plot_error_rate(model.error_rate, ALPHA, batch_id)

        batch_results.append(
            (batch_id, model.get_regret()[-1], model.get_error_rate()[-1],
             1 - acc, precision, recall))

        with open('batch/regret' + str(model) + batch_id, 'wb') as fp:
            pickle.dump(model.regret, fp)
        with open('batch/error' + str(model) + batch_id, 'wb') as fp:
            pickle.dump(model.error_rate, fp)

    return batch_results
Example No. 16
def train_model():
    (data, label) = data_loader.get_data()
    data = data.reshape((len(data), 256 * 256))

    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=20)

    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')

    clf.fit(x_train, y_train)
    save_model(clf)
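Note: save_model is not defined here. A plausible sketch using joblib, mirroring the persistence style of the later examples (the file name is an assumption):

import joblib

def save_model(model, path='logistic_regression.model'):
    # Serialize the fitted scikit-learn estimator to disk.
    joblib.dump(model, path)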
Example No. 17
def train_model(model_name='random_forest', verbose=False):
    X_train, X_test, y_train, y_test = get_data(num_days_per_month=7)

    # train model
    model = build_train_model(model_name, X_train, y_train, vb=verbose)

    # evaluate model
    evaluate(model, X_test, y_test, model_name=model_name, plot_probs=True)

    # save model
    os.chdir("../../../../../../home/sdas11/")
    joblib.dump(model, f'{model_name}.model')

    feat_imp(model, model_name=model_name)
Example No. 18
 def setUp(self):
     """
     This method sets the variables for the following tests.
     """
     self._m = 100
     self._n = 30
     self._k = 5
     self._increment = 20
     self._A = get_data(ExperimentType.ExampleNo2)(self._m, np.arange(2 * self._k).astype(float))
     self._approximation = random_svd(self._A, self._k, self._increment)
     self._U = self._approximation.U
     self._sigma = np.array(self._approximation.sigma)
     self._VT = self._approximation.V.T
     self._approximation = self._approximation.as_numpy_arr()
     self._A = self._A.as_numpy_arr()
Example No. 19
 def setUp(self):
     """
     This method sets the variables for the following tests.
     """
     self._m = 100
     self._n = 30
     self._k = 5
     self._increment = 20
     self._A = get_data(ExperimentType.ExampleNo2)(self._m, np.arange(2 * self._k).astype(float))
     self._approximation = random_id(self._A, self._k, self._increment)
     self._B = self._approximation.B
     self._P = np.array(self._approximation.P)
     self._A = self._A.as_numpy_arr()
     self._n = self._A.shape[1]
     self._approximation = self._approximation.as_numpy_arr()
Example No. 20
def config():
    """ Config section

    This function contains all possible configurations for all experiments. Full details on each configuration value
    can be found in :mod:`enums.py`.
    """

    experiment_type: str = ExperimentType.ExampleNo5
    singular_values: RowVector = choose_singular_values(experiment_type)
    used_data_factory: Callable = get_data(experiment_type)
    data_sizes: List = choose_data_sizes(experiment_type)
    approximation_ranks: List = choose_approximation_ranks(experiment_type)
    increments: List = choose_increments(experiment_type)
    results_path: str = r'Results/'
    power_method_iterations: int = 100
Example No. 21
def train_all(verbose=False):
    X_train, X_test, y_train, y_test = get_data(num_days_per_month=7)

    for model_name in  ['NN', 'random_forest', 'xgboost_clf', 'logistic_regression', 'svm', 'naive_bayes']:
        print(model_name.upper(), ':')
        # train model
        model = build_train_model(model_name, X_train, y_train, vb=verbose)

        # evaluate model
        evaluate(model, X_test, y_test, model_name=model_name, plot_probs=False)

        # save model
        os.chdir("../../../../../../home/sdas11/")
        joblib.dump(model, f'{model_name}.model')
        print('\n')
        feat_imp(model, model_name=model_name, show_plot=False)
        print('\n')
Example No. 22
    def __init__(self):
        
        _, self.data_dict, all_features, all_objects = data_loader.get_data()

        self.unexamined_features = all_features[:]
        for f in feature_blacklist: self.unexamined_features.remove(f)
        self.known_features = {}
        self.probabilities = {}
        self.probability_features_all_objects = {}
        self.questions_asked = 0
        self.unexamined_objects = all_objects[:]
        self.ordered_probs = []
        
        for o in self.unexamined_objects:
            self.probability_features_all_objects[o] = 1.0
            self.probabilities[o] = self.prior_probability(o)
            
        self.cur_entropy = self.entropy(self.probabilities)
Example No. 23
def run(target, start=0, iteration=10, batch_size=256, epochs=500, out='./model', name=None):
    x_train, y_train, x_test = get_data(target=target, name=name)
    print(f'Kind of Data: {len(x_train)}')
    
    x_pred_list, y_pred_list, m_pred_list, v_pred_list = [], [], [], []
    for i in range(iteration):
        model = set_model(target, x_train, out=out, name=name)

        hist = train(model, x_train, y_train,
                     epochs=epochs,
                     batch_size=batch_size,
                     label=LABEL[target],
                     seq=start+i,
                     out=out)
        best_model = load_best_model(target, label=LABEL[target], seq=start+i, out=out)
        pred_data_test = best_model.predict(x_test)
        if target == 0:
            x_pred_list.append(pred_data_test[:,0])
            y_pred_list.append(pred_data_test[:,1])
        if target == 1 or target == 3:
            m_pred_list.append(pred_data_test[:,2])
        if target == 2 or target == 3:
            v_pred_list.append(pred_data_test[:,3])

    if target == 0:
        x_pred = np.mean(x_pred_list, axis=0)
        y_pred = np.mean(y_pred_list, axis=0)
    if target == 1 or target == 3:
        m_pred = np.mean(m_pred_list, axis=0)
    if target == 2 or target == 3:
        v_pred = np.mean(v_pred_list, axis=0)
    
    # submit
    submit = pd.read_csv('xy_mlp_ensemble10_2nd.csv')
    if target == 0:
        submit.iloc[:, 1] = x_pred
        submit.iloc[:, 2] = y_pred
    if target == 1 or target == 3:
        submit.iloc[:, 3] = m_pred
    if target == 2 or target == 3:
        submit.iloc[:, 4] = v_pred
    submit.to_csv(os.path.join(out, 'mv_ensemble10.csv'), index=None)
Example No. 24
def train():
    # the data, split between train and test sets
    (data, labels) = data_loader.get_data()
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=20)
    x_train = x_train.reshape(len(x_train), 256 * 256)
    x_test = x_test.reshape(len(x_test), 256 * 256)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')

    pca = PCA(n_components=50)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)

    svm = SVC()
    parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    # {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    grid = GridSearchCV(svm, parameters, verbose=3)
    grid.fit(x_train[0:7000],
             y_train[0:7000])  # grid search learning the best parameters

    print(grid.best_params_)

    # Now we train the best estimator in the full dataset
    best_svm = grid.best_estimator_
    best_svm.fit(x_train, y_train)
    print("svm done")

    print("Testing")
    print("score: ", best_svm.score(
        x_test,
        y_test,
    ))
Example No. 25
def solve_distributed():
    X, y, n, dim = data_loader.get_data()
    X_trans = X.transpose()
    iteration_cnt = 100

    opt_loss = optimal_solve(X, y)

    all_staleness = [0, 5, 10, 20, 50]
    colors = ['yellow', 'blue', 'black', 'red', 'green']

    worker_cnt = 4

    bt_beta = 0.5
    bt_alpha = 0.1

    worker_data_X = []
    worker_data_X_trans = []
    worker_data_Y = []
    each_worker_data_cnt = n // worker_cnt
    for i in range(worker_cnt):
        worker_data_X.append(X[i * each_worker_data_cnt:(i + 1) *
                               each_worker_data_cnt])
        worker_data_Y.append(y[i * each_worker_data_cnt:(i + 1) *
                               each_worker_data_cnt])
        worker_data_X_trans.append(worker_data_X[i].transpose())

    for staleness in all_staleness:
        event_seq = seq_gen.gen_event_sequence(worker_cnt, staleness,
                                               iteration_cnt)
        worker_param = {i: np.zeros(dim + 1) for i in range(worker_cnt)}
        current_server_beta = np.zeros(dim + 1)

        current_loss = []

        for event in event_seq:
            event_type, worker_num = event.split("_")
            worker_num = int(worker_num)
            assert (event_type in ["push", "pull"])

            if event_type == "push":
                local_X = worker_data_X[worker_num]
                local_X_trans = worker_data_X_trans[worker_num]
                local_y = worker_data_Y[worker_num]
                local_beta = worker_param[worker_num]

                gradient = local_X.dot(local_beta) - local_y
                gradient = local_X_trans.dot(gradient)

                gradient_norm = np.linalg.norm(gradient, ord=2)
                gradient_norm_sq = gradient_norm**2

                step_size = 1
                # backtracking line search
                while target_func(X, y, current_server_beta - step_size * gradient) > \
                      target_func(X, y, current_server_beta) - bt_alpha * step_size * gradient_norm_sq:
                    step_size *= bt_beta
                current_server_beta -= step_size * gradient

                diff = target_func(X, y, current_server_beta) - opt_loss
                diff /= n
                current_loss.append(math.log(diff))

                print(math.log(diff))

            else:
                worker_param[worker_num] = np.copy(current_server_beta)
        plt.plot([i for i in range(len(current_loss))],
                 list(current_loss),
                 color=colors[all_staleness.index(staleness)],
                 label=('Staleness: ' + str(staleness)))
    plt.title('Linear regression with backtracking line search')
    plt.legend()
    plt.show()
Example No. 26
def test_data_baseline(alg, data, true_labels):
    print()
    alg.train(data, true_labels)

    labels = list(map(util.bucket, alg.evaluate(data)))
    true_labels = list(map(util.bucket, true_labels))

    print("##### " + str(alg) + "#####")
    #acc = util.get_accuracy_bucketed(labels, true_labels)
    #print("accuracy on data with " + str(alg) + ": " + str(acc))

    acc, precision, recall = util.evaluate_performance(labels, true_labels)
    #print("bucket accuracy with " + str(alg) + ":" + str(bucket_acc))


if __name__ == '__main__':

    data, true_labels, columns_dict, values_dict = dl.get_data()

    fixed = Fixed_Dose(columns_dict, values_dict)

    #test_dummy_baseline(fixed)
    test_data_baseline(fixed, data, true_labels)

    clinical = Warfarin_Clinical_Dose(columns_dict, values_dict)
    test_data_baseline(clinical, data, true_labels)

    pharma = Warfarin_Pharmacogenetic_Dose(columns_dict, values_dict)
    test_data_baseline(pharma, data, true_labels)
Example No. 27
    exp_io.set_logging(
        args.log_level,
        args.log_file,
        filemode=args.log_filemode,
    )

    args.out_file = os.path.join(args.output_dir, args.out_file + '.csv')

    return args


if __name__ == '__main__':
    args = parse_args()

    features, labels = data_loader.get_data(
        data_path=args.in_file,
        k=args.lsb_bits,
    )

    rnn = BranchRNN(
        #features.shape[1],
        1,
        units=args.units,
        hidden_layers=args.hidden_layers,
        window_size=args.window_size,
        tfboard_log=args.tfboard_log,
        update_freq=args.tfboard_freq,
        cudnn=args.cudnn,
        gru=args.gru,
        batch_size=args.batch_size,
        history_size=args.history_size,
        epochs=args.epochs,
Example No. 28
def solve_distributed_fixed():
    X, y, n, dim = data_loader.get_data()
    X_trans = X.transpose()
    iteration_cnt = 5000

    # step size 0.000001 , staleness = 5 converges, staleness = 100 diverges
    step_size = 0.0000007

    opt_loss = optimal_solve(X, y)

    all_staleness = [0, 5, 10, 20, 50]
    colors = ['yellow', 'blue', 'black', 'red', 'green']
    worker_cnt = 4

    worker_data_X = []
    worker_data_X_trans = []
    worker_data_Y = []
    each_worker_data_cnt = n // worker_cnt
    for i in range(worker_cnt):
        worker_data_X.append(X[i * each_worker_data_cnt:(i + 1) *
                               each_worker_data_cnt])
        worker_data_Y.append(y[i * each_worker_data_cnt:(i + 1) *
                               each_worker_data_cnt])
        worker_data_X_trans.append(worker_data_X[i].transpose())

    for staleness in all_staleness:
        event_seq = seq_gen.gen_event_sequence(worker_cnt, staleness,
                                               iteration_cnt)
        worker_param = {i: np.zeros(dim + 1) for i in range(worker_cnt)}
        current_server_beta = np.zeros(dim + 1)

        current_loss = []

        for event in event_seq:
            event_type, worker_num = event.split("_")
            worker_num = int(worker_num)
            assert (event_type in ["push", "pull"])

            if event_type == "push":
                local_X = worker_data_X[worker_num]
                local_X_trans = worker_data_X_trans[worker_num]
                local_y = worker_data_Y[worker_num]
                local_beta = worker_param[worker_num]

                gradient = local_X.dot(local_beta) - local_y
                gradient = local_X_trans.dot(gradient)

                current_server_beta -= step_size * gradient

                diff = target_func(X, y, current_server_beta) - opt_loss
                diff /= n

                current_loss.append(math.log(diff))
                print(math.log(diff))

            else:
                worker_param[worker_num] = np.copy(current_server_beta)

        loss_to_plot = [
            current_loss[4 * i] for i in range(len(current_loss) // 4)
        ]
        plt.plot([i for i in range(len(loss_to_plot))],
                 loss_to_plot,
                 color=colors[all_staleness.index(staleness)],
                 label=('Staleness: ' + str(staleness)))
        # plt.plot([i for i in range(len(current_loss))], list(current_loss), color=colors[all_staleness.index(staleness)], label=('Staleness: ' + str(staleness)))
    plt.title('Linear regression with learning rate: ' + str(step_size))
    plt.legend()
    plt.show()
Example No. 29
def run():
    print('Loading data...')

    #data_folder = '../data/weibo_xiaoice_large'
    training, validation, test, embedding_matrix, label_map, VOCAB = get_data(
        USE_FA=False, USE_GLOVE_EMBED=True)
    tr_gen = batch_generator(training[0],
                             training[-1],
                             batch_size=256,
                             shuffle=True)
    te_gen = batch_generator(validation[0],
                             validation[-1],
                             batch_size=1024,
                             shuffle=False)

    print('VOCAB size:{}'.format(VOCAB))

    # Summation of word embeddings
    LAYERS = 1
    USE_GLOVE = True
    TRAIN_EMBED = False
    EMBED_HIDDEN_SIZE = 256
    SENT_HIDDEN_SIZE = 256
    BATCH_SIZE = 512
    PATIENCE = 6  # 8
    MAX_EPOCHS = 100
    MAX_LEN_TITLE = 30
    MAX_LEN_DES = 128
    DP = 0.2
    L2 = 4e-06
    ACTIVATION = 'relu'
    OPTIMIZER = 'rmsprop'
    # OPTIMIZER = 'adadelta'
    MLP_LAYER = 1
    NGRAM_FILTERS = [1, 2, 3, 4]
    NUM_FILTER = 128
    RNN_Cell = 'BiLSTM'

    print('Embed / Sent = {}, {}'.format(EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE))
    print('GloVe / Trainable Word Embeddings = {}, {}'.format(
        USE_GLOVE, TRAIN_EMBED))

    LABEL_NUM = len(label_map.classes_)

    bst_model_path = '../model/rnn_attn_v2.hdf5'
    pred_path = '../res/rnn_attn_v2.pkl'
    res_path = '../res/rnn_attn_v2.csv'

    embed_title = get_embedding(embedding_matrix, USE_GLOVE, VOCAB,
                                EMBED_HIDDEN_SIZE, TRAIN_EMBED, MAX_LEN_TITLE)

    embed_des = get_embedding(embedding_matrix, USE_GLOVE, VOCAB,
                              EMBED_HIDDEN_SIZE, TRAIN_EMBED, MAX_LEN_DES)

    model = rnn_att_model(embed_title,
                          embed_des,
                          MAX_LEN_TITLE,
                          MAX_LEN_DES,
                          SENT_HIDDEN_SIZE,
                          ACTIVATION,
                          DP,
                          L2,
                          LABEL_NUM,
                          OPTIMIZER,
                          MLP_LAYER,
                          LAYERS,
                          RNN_Cell='BiLSTM')

    early_stopping = EarlyStopping(monitor='val_top_k_categorical_accuracy',
                                   patience=4)
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=True)
    #print training[0][0].shape[0],validation[0][0].shape[0]
    #model.load_weights(bst_model_path)
    model.fit_generator(
        tr_gen,
        steps_per_epoch=int(training[0][0].shape[0] / BATCH_SIZE) + 1,
        epochs=100,
        verbose=1,
        validation_data=te_gen,
        validation_steps=int(validation[0][0].shape[0] / BATCH_SIZE) + 1,
        max_q_size=20,
        callbacks=[early_stopping, model_checkpoint])

    print 'load weights'
    model.load_weights(bst_model_path)
    pred = model.predict(test)
    pd.to_pickle(pred, pred_path)

    def get_ans(pred=pred, idd=np.random.random(3000)):
        pred = pred.argsort(axis=1)[:, ::-1][:, :5]
        ll = label_map.classes_
        ans = [[ll[item] for item in items] for items in pred]
        res = pd.DataFrame(ans)
        res.index = idd
        return res

    test_idx = get_testid()
    ans = get_ans(pred, test_idx)
    ans.to_csv(res_path, index=True, header=False)
    
Example No. 30
from sqlalchemy import create_engine, MetaData, Table
import numpy as np
import json
#from matplotlib import pyplot as plt
import cPickle as pickle
#from runner_nonbayesian import NonBayesianPlayer

#from matplotlib import pyplot as plt
import scipy.stats as scistats
from multiprocessing import Pool
import data_loader
import copy

import cPickle as pickle
data_matrix, data_dict, features, items = data_loader.get_data()


db_url = "mysql://*****:*****@gureckislab.org/mt_experiments"
table_name = '20q_model_tester_exp2'
data_column_name = 'datastring'
# boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)
# make a query and loop through
s = table.select()
rows = s.execute()

Example No. 31
import sys
from genetic_mapper import Mapper
import data_loader

if __name__ == "__main__":
    if (len(sys.argv) <= 1):
        print("Please enter path to frequency file!")
    else:
        freq_file = sys.argv[1]
        phenotypeFreq = data_loader.get_data(freq_file)
        print(phenotypeFreq)
        Mapper(phenotypeFreq).solve().print_solution()
Example No. 32
  model_result_filename_prefix = {"perceptron": "perceptron-",
                                  "slp": "neural-net-single-",
                                  "mlp": "neural-net-ml-",
                                  "lstm": "LSTM-pytorch-",
                                  "gru": "GRU-pytorch-"}

  output_file = open(output_dir_path + "metrics_interval.txt", "w")

  for model in ["perceptron", "slp", "mlp", "lstm", "gru"]:
    for file_size in ["56", "10K", "8M"]:
      trace_filename = "gcc-" + file_size + ".txt"
      trace_filepath = trace_dir_path + trace_filename
      results_filepath = results_dir_path + model_result_filename_prefix[model] + trace_filename

      trace_features, trace_labels = data_loader.get_data(data_path=trace_filepath, k=8)
      result_labels = result_data_loader.get_data(data_path=results_filepath)

      for metric in [accuracy_score, f1_score, mutual_info_score, confusion_matrix, matthews_corrcoef]:
        metric_output = interval_result(trace_labels, result_labels, metric, 4)

        output_file.write("***> " + model + " " + file_size + " " + metric.__name__ + ":\n")
        output_file.write(str(metric_output))
        output_file.write("\n\n\n")

  output_file.close()

  # (m,n) baseline
  output_file_mn = open(output_dir_path + "metrics_interval_mn.txt", "w")

  for file_size in ["56", "10K", "8M"]:
Example No. 33
'''
Created on Nov 23, 2015

@author: alxcoh
'''
import numpy as np
import data_loader
import scipy.stats as scistats
from runner_clust import ClustPlayer
from multiprocessing import Pool
data_matrix, _, features, objects = data_loader.get_data()

full_probability_matrix_goodmethod = [[ 0.70385265,  0.31086353,  0.18540700,  0.12579383,  0.06243025],
                                      [ 0.09839965,  0.27167072,  0.12949217,  0.08246418,  0.03858618],
                                      [ 0.09718542,  0.21443453,  0.37983041,  0.20084763,  0.09890584],
                                      [ 0.05635820,  0.11671840,  0.17166805,  0.31256788,  0.17611042],
                                      [ 0.04420408,  0.08631282,  0.13360238,  0.27832648,  0.62396730]]

logfile = open("clustmaker_logfile.txt", 'w')

def get_clusters_given_posterior(n_clusters, current_clustering, posterior = np.repeat(0.01, (1000)), knowledge = []):
    #probget = np.all(0.2, (n_clusters, 218, 5), dtype=np.float64)
 
    did_shift = True
    clustering = current_clustering[:]
    iterations = 0
    while did_shift:
        iterations += 1
        print "\n\n*****************\nIterations:", iterations, "\n", "Clustering:", clustering, "\n*********************\n\n\n"
        did_shift = False
        #curval = cluster_validity(clustering)
Example No. 34
all_results = {}
final_results = {}
actual_results = {}
first_time = True
predictions = {}
actuals = {}

for metric_to_estimate in intersting_metrics:

    predictions[metric_to_estimate] = []
    actuals[metric_to_estimate] = []

    if future:
        # Use 2018 data to predict 2019 labels
        df, labels = get_data(2018,
                              2018,
                              target=metric_to_estimate,
                              future=future)
    else:
        # Use 2019 data to predict 2019 labels
        df, labels = get_data(2019,
                              2019,
                              target=metric_to_estimate,
                              future=future)

    # Extract names of players in a given year
    names = np.array(labels[['Name']].values.tolist())
    names = names[:, 0]
    print(metric_to_estimate)
    # Extract data for that metric
    metric = np.array(labels[[metric_to_estimate]].values.tolist())
    metric = metric[:, 0]
    from tree_variational import *
    from runner_variational import *
    
#from sqlalchemy import create_engine, MetaData, Table
import numpy as np
import json
#from matplotlib import pyplot as plt
import cPickle as pickle
#from runner_nonbayesian import NonBayesianPlayer

#from matplotlib import pyplot as plt
import scipy.stats as scistats
from multiprocessing import Pool
import data_loader

data_matrix, data_dict, features, objects = data_loader.get_data()

fulldatafile = open(base_path + "src/analysis_files/datalogs/fulldata_correctorder.txt", 'w')


'''
db_url = "mysql://*****:*****@gureckislab.org/mt_experiments"
table_name = '20q_model_tester'
data_column_name = 'datastring'
# boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)
# make a query and loop through
s = table.select()