def test_run():
    """Plot GOOGL and GLD daily returns against SPY with fitted lines.

    Loads prices for the requested symbols, converts them to daily returns,
    draws one scatter plot per asset against SPY with the degree-1
    least-squares line on top, prints the fitted coefficients and finally
    the Pearson correlation matrix of the daily returns.
    """
    date_range = pd.date_range('2014-01-01', '2015-02-10')
    # NOTE(review): symbols are requested in lower case but the columns are
    # read back as 'GOOGL' / 'GLD' -- presumably get_data normalises the
    # case; verify against its implementation.
    tickers = ['SPY', 'googl', 'gld']
    returns = compute_daily_returns(get_data(tickers, date_range))

    # np.polyfit(..., 1) yields (slope, intercept): beta is the slope and
    # alpha is the y-intercept of the regression line.
    returns.plot(kind='scatter', x='SPY', y='GOOGL')
    google_beta, google_alpha = np.polyfit(returns['SPY'], returns['GOOGL'], 1)
    plt.plot(returns["SPY"], google_beta * returns['SPY'] + google_alpha,
             '-', color='red')

    returns.plot(kind='scatter', x='SPY', y='GLD')
    gold_beta, gold_alpha = np.polyfit(returns['SPY'], returns['GLD'], 1)
    plt.plot(returns['SPY'], gold_beta * returns['SPY'] + gold_alpha,
             '-', color='red')

    report = (
        ('Alpha Gold:', gold_alpha),
        ('Beta Gold:', gold_beta),
        ('Alpha Google:', google_alpha),
        ('Beta Google:', google_beta),
    )
    for caption, coefficient in report:
        print(caption, coefficient)

    plt.show()
    print(returns.corr(method='pearson'))
def get_trainable_data():
    """Return train/test data converted by ``df_to_keras_format``.

    Prints the head of both frames returned by ``get_data`` as a quick
    sanity check.

    Returns:
        ``((x_train, y_train), (x_test, y_test), mean, std)`` where ``mean``
        and ``std`` are passed through unchanged from ``get_data``.
    """
    train_frame, test_frame, mean, std = get_data()
    for frame in (train_frame, test_frame):
        print(frame.head())

    train_inputs, train_targets = df_to_keras_format(train_frame)
    test_inputs, test_targets = df_to_keras_format(test_frame)
    return (train_inputs, train_targets), (test_inputs, test_targets), mean, std
def main(_):
    """Run the saved model on one small training batch and print its targets.

    Args:
        _: unused positional argument (the function never reads it).
    """
    dataset = get_data(FLAGS.data)
    train, _val, _test = split_data(dataset)

    # The last `target_columns` columns of a row are targets; everything
    # before them is model input.
    target_columns = 1
    feature_columns = train.shape[1] - target_columns

    batch_size = 2 ** 4
    inputs, targets = get_batch(train, batch_size, feature_columns)
    call_model(FLAGS, 'model', inputs)
    print(targets)
def main():
    """Train a UNet on landmark data described by command-line arguments.

    Command-line options:
        --file_paths / --landmark_paths: text files forwarded to ``get_data``.
        --landmark: index of the landmark to train for.
        --separator: separator forwarded to ``get_data``.
        --batch_size: mini-batch size of the data loader.
        --num_epochs: number of passes over the data.
        --log_freq: a checkpoint is written every ``log_freq`` epochs.
        --save_path: checkpoint directory; no checkpoints when omitted.

    The model, the loss weights and every mini-batch are placed on the same
    device (CUDA when available, otherwise CPU).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_paths', default="data/files.txt")
    parser.add_argument('--landmark_paths', default="data/landmarks.txt")
    parser.add_argument('--landmark', type=int, default=0)
    parser.add_argument('--save_path')
    parser.add_argument('--num_epochs', type=int, default=int(1e9))
    parser.add_argument('--log_freq', type=int, default=100)
    parser.add_argument('--separator', default=",")
    parser.add_argument('--batch_size', type=int, default=8)
    args = parser.parse_args()

    num_epochs = args.num_epochs
    log_freq = args.log_freq
    save_path = args.save_path

    x, y = get_data(args.file_paths, args.landmark_paths, args.landmark,
                    separator=args.separator)
    print(f"Got {len(x)} images with {len(y)} landmarks")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("device", device)

    dataset = TensorDataset(torch.Tensor(x), torch.Tensor(y))
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # Move the model before building the optimizer so the optimizer holds
    # references to the parameters that are actually trained.
    unet = UNet(in_dim=1, out_dim=6, num_filters=4)
    unet.to(device)
    # The class weights live inside the loss module; they must sit on the
    # same device as the logits.
    criterion = torch.nn.CrossEntropyLoss(weight=get_weigths(y)).to(device)
    optimizer = optim.SGD(unet.parameters(), lr=0.001, momentum=0.9)

    if save_path is not None:
        # torch.save does not create missing directories.
        os.makedirs(save_path, exist_ok=True)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            # Batches come out of the loader on the CPU; without this the
            # forward pass fails as soon as the model is on CUDA.
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = unet(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"[{epoch+1}/{num_epochs}] loss: {running_loss}")

        if epoch % log_freq == log_freq - 1 and save_path is not None:
            torch.save(unet.state_dict(),
                       os.path.join(save_path, f"unet-{epoch}.pt"))
def plot_2_histograms():
    """Overlay the daily-return histograms of SPY and GOOGL on one figure.

    Reads the date range from the module-level ``dates``.
    """
    daily_returns = compute_daily_returns(get_data(['SPY', 'GOOGL'], dates))

    # Both histograms are drawn onto the same axes so they can be compared;
    # the label feeds the legend.
    for ticker in ('SPY', 'GOOGL'):
        daily_returns[ticker].hist(bins=20, label=ticker)

    plt.legend(loc='upper right')
    plt.show()
def _test(train_args, pretrain_args, args):
    """Restore a saved model and print its perplexity on the test speakers.

    Args:
        train_args: arguments the model was trained with; its ``speakers``
            are iterated as the "train" side of every comparison.
        pretrain_args: pre-training arguments; when truthy a Saver is built
            through ``utils.create_tf_saver`` instead of a plain
            ``tf.train.Saver``.
        args: new arguments merged over ``train_args`` for this test run.
    """
    print('Testing', ', '.join(args.speakers), '...')

    # Overlay the new arguments on the ones used for training.
    test_args = utils.set_new_args(train_args, args)

    # Only the test split and the vocabulary lookup are needed here.
    _, _, test_data, id_to_word = data_reader.get_data(test_args)
    _, test_config = utils.set_config(test_args, id_to_word)

    # The model constructor needs an initial embedding; the restored
    # checkpoint overrides it afterwards.
    init_embed = utils.init_embedding(id_to_word,
                                      dim=test_args.embed_size,
                                      init_scale=test_args.init_scale,
                                      embed_path=test_args.embed_path)

    with tf.Graph().as_default():
        with tf.name_scope('Test'), tf.variable_scope('Model', reuse=None):
            m_test = model.Model(test_args, is_training=False,
                                 config=test_config, init_embed=init_embed,
                                 name='Test')
            m_test.build_graph()

        init = tf.global_variables_initializer()

        if pretrain_args:
            # A pretrained model needs an explicit name -> variable mapping
            # for the Saver.
            trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = {var.op.name: var for var in trainable}
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, test_args.load_path)

            # Evaluate every (test speaker, train speaker) combination.
            for test_ind, test_speaker in enumerate(test_args.speakers):
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    print('Testing {0} with {1} model'.format(
                        test_speaker, train_speaker))
                    test_perplexity = _run_epoch(sess, m_test, test_args,
                                                 test_data, train_ind,
                                                 test_ind)
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))
def main():
    """Run the MLP, decision-tree and KNN classifiers on ``spam.dat``.

    Each runner is called with display enabled and optimisation disabled;
    their return values are not used. ``plt.show()`` is called once all
    runners have finished.
    """
    x_train, x_test, y_train, y_test = data_reader.get_data("spam.dat")

    MLP.runalt(x_train, y_train, x_test, y_test, display=True, optimize=False)
    DTC.run(x_train, y_train, x_test, y_test, display=True, optimize=False)
    # Other runners (DTC.runRFE, SVM.run, Bayes.run) are currently disabled.
    # KNN names its flag `optimization`, unlike the runners above.
    KNN.runalt(x_train, y_train, x_test, y_test, display=True,
               optimization=False)

    plt.show()
def run_program(model_path, data_path):
    """Run the predictor over the dataset and measure accuracy and speed.

    Args:
        model_path: model location forwarded to the config builder.
        data_path: dataset location forwarded to the data loader.

    Returns:
        ``(hx_acc, ctc_acc, fps)``: accuracy of the first output against
        single-element labels, accuracy of the second (sequence) output,
        and samples per second over the timed part of the run. An accuracy
        is ``0.0`` when no sample of its kind was evaluated.
    """
    place = fluid.CPUPlace()
    if test_args.use_ptq:
        warmup_data, inputs, labels = get_data_with_ptq_warmup(
            data_path, place)
        config = set_config_ptq(model_path, warmup_data)
    else:
        inputs, labels = get_data(data_path, place)
        config = set_config(model_path)

    predictor = create_paddle_predictor(config)

    all_hz_num = 0
    ok_hz_num = 0
    all_ctc_num = 0
    ok_ctc_num = 0
    processed = 0
    dataset_size = len(inputs)

    start = time.time()
    for i in range(dataset_size):
        # Restart the clock once the warm-up samples are done.
        if i == test_args.warmup_iter:
            start = time.time()

        hz_out, ctc_out = predictor.run([inputs[i]])
        processed += 1

        np_hz_out = np.array(hz_out.data.float_data()).reshape(-1)
        np_ctc_out = np.array(ctc_out.data.int64_data()).reshape(-1)
        out_hz_label = np.argmax(np_hz_out)

        this_label = labels[i]
        this_label_data = np.array(this_label.data.int32_data()).reshape(-1)

        # Element-wise comparison of prediction and label. The previous
        # `a.all() == b.all()` only compared two booleans ("are all entries
        # non-zero?") and therefore accepted almost any prediction.
        ctc_matches = np.array_equal(np_ctc_out, this_label_data)

        if this_label.shape[0] == 1:
            all_hz_num += 1
            if out_hz_label == this_label_data[0]:
                ok_hz_num += 1

            if this_label_data[0] <= 6350:
                all_ctc_num += 1
                if np_ctc_out.shape[0] == 1 and ctc_matches:
                    ok_ctc_num += 1
        else:
            all_ctc_num += 1
            if np_ctc_out.shape[0] == this_label.shape[0] and ctc_matches:
                ok_ctc_num += 1

        # Stop early once enough samples have been scored.
        if all_ctc_num > 1000 or all_hz_num > 1000:
            break
    end = time.time()

    # Count only the samples that were really run after the clock was
    # (re)started; the loop may have stopped before the end of the dataset.
    if processed > test_args.warmup_iter:
        timed_samples = processed - test_args.warmup_iter
    else:
        timed_samples = processed
    elapsed = end - start
    fps = timed_samples / elapsed if elapsed > 0 else 0.0

    hx_acc = ok_hz_num / all_hz_num if all_hz_num else 0.0
    ctc_acc = ok_ctc_num / all_ctc_num if all_ctc_num else 0.0
    return hx_acc, ctc_acc, fps
def _generate(train_args, pretrain_args, args):
    """Restore a trained model and generate sample text per speaker.

    Args:
        train_args: arguments the model was trained with; its ``speakers``
            provide the index passed to ``generate_text``.
        pretrain_args: pre-training arguments; when truthy a Saver is built
            through ``utils.create_tf_saver`` instead of a plain
            ``tf.train.Saver``.
        args: new arguments merged over ``train_args``; ``args.temp`` is
            forwarded to ``generate_text``.
    """
    # Overlay the generation arguments on the ones used for training.
    gen_args = utils.set_new_args(train_args, args)

    # Only the vocabulary lookup is needed for generation.
    _, _, _, id_to_word = data_reader.get_data(gen_args)
    gen_config, _ = utils.set_config(gen_args, id_to_word)

    # The model constructor needs an initial embedding; the restored
    # checkpoint overrides it afterwards.
    init_embed = utils.init_embedding(id_to_word,
                                      dim=gen_args.embed_size,
                                      init_scale=gen_args.init_scale,
                                      embed_path=gen_args.embed_path)

    with tf.Graph().as_default():
        # Built under the 'Train' name scope because that is where the
        # trained parameters live.
        with tf.name_scope('Train'), tf.variable_scope('Model', reuse=None):
            m_gen = model.Model(gen_args, is_training=False,
                                config=gen_config, init_embed=init_embed,
                                name='Generate')
            m_gen.build_graph()

        init = tf.global_variables_initializer()

        if pretrain_args:
            # A pretrained model needs an explicit name -> variable mapping
            # for the Saver.
            trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = {var.op.name: var for var in trainable}
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, gen_args.load_path)

            for gen_ind, gen_speaker in enumerate(gen_args.speakers):
                print('Generating text for {0}'.format(gen_speaker))
                # Generate with the index the speaker had during training.
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    if gen_speaker != train_speaker:
                        continue
                    generate_text(sess, m_gen, id_to_word, train_ind,
                                  args.temp)
def run_algorithm(algorithm):
    """Run the named mining algorithm, then plot and print its rules.

    Only ``'apriori'`` is implemented; any other name is a no-op.

    Args:
        algorithm: name of the algorithm to run.
    """
    if algorithm != 'apriori':
        return

    data = get_data("shopping.json")
    rules = apriori(data, 0.3, 0.5)
    rules = rules.sort_values(by=['ir'], ascending=[True])

    # Draw first, then show. The Axes returned by .plot() used to be passed
    # into plt.show(), whose only parameter is the `block` flag.
    rules.query('ir > 0.0').plot(kind='bar', x='rule',
                                 y=['ir', 'kulczynski'],
                                 rot=45, fontsize=6)
    plt.show()

    print(rules)
def main(_):
    """Print how often the exported model favours each class's own column.

    Rows with ``change == 1`` ("plus") and ``change == 0`` ("minus") are
    sent through ``call_model``. A plus row counts as correct when output
    column 0 exceeds column 1; a minus row when column 1 exceeds column 0.

    Args:
        _: unused positional argument (the function never reads it).
    """
    # Kept from the original flow; the result is only needed by the
    # currently disabled comparison against eval_kde.
    get_kde_models(FLAGS.kde_model)

    data = get_data(FLAGS.data)
    names = [
        # "current_slice",
        "blue", "blue_1", "green", "green_1", "red", "red_1",
        "nir", "nir_1", "swir1", "swir1_1", "swir2", "swir2_1"
    ]
    plus = np.array(data.loc[data['change'] == 1][names])
    minus = np.array(data.loc[data['change'] == 0][names])

    density_plus = call_model(FLAGS, 'model', plus)
    print(density_plus)
    density_minus = call_model(FLAGS, 'model', minus)
    print(density_minus)

    # np.count_nonzero replaces `sum(mask.astype(np.int))`: the `np.int`
    # alias was removed from NumPy and raises AttributeError there.
    true_plus = density_plus[:, 0] > density_plus[:, 1]
    true_plus_prop = np.count_nonzero(true_plus) / len(true_plus)

    true_minus = density_minus[:, 1] > density_minus[:, 0]
    true_minus_prop = np.count_nonzero(true_minus) / len(true_minus)

    print('Plus prob', true_plus_prop)
    print('Minus prob', true_minus_prop)
def test_run():
    """Plot the daily-return histogram with mean and +/- std markers.

    Prints the mean and standard deviation of the 'SPY' daily returns,
    marks them on the histogram with dashed vertical lines and, after the
    figure is shown, prints the kurtosis of the daily returns.

    Reads the date range from the module-level ``dates``.
    """
    # NOTE(review): no symbols are requested, yet 'SPY' is read below --
    # presumably get_data always includes it; verify.
    daily_returns = compute_daily_returns(get_data([], dates))
    daily_returns.hist(bins=20)

    spy_returns = daily_returns['SPY']
    mean = spy_returns.mean()
    print("mean=", mean)
    std = spy_returns.std()
    print("std=", std)

    # White line for the mean, red lines one standard deviation either side
    # of zero.
    for position, colour in ((mean, 'w'), (std, 'r'), (-std, 'r')):
        plt.axvline(position, color=colour, linestyle='dashed', linewidth=2)
    plt.show()

    print(daily_returns.kurtosis())
num_input = 6 * 2 # Prosody timesteps = 1200 # 60 sec * 20 frames/sec = 1200 num_hidden = 30 # num units in LSTM cell keep_prob_train = 0.75 experiments = [5,10,20,40,60] for experiment in experiments: tf.reset_default_graph() num_output_units = experiment # 20 frames/sec # Reading data print("Reading data...") x_train, y_train, x_test, y_test = data_reader.get_data() y_train = y_train[:,:,0:num_output_units] y_test = y_test[:,:,0:num_output_units] print(x_train.shape) # tf Graph input X = tf.placeholder("float", [None, timesteps, num_input]) Y = tf.placeholder("float", [None, timesteps, num_output_units]) keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) # Define weights/biases weights = { 'hidden1': tf.get_variable("w_hid1", shape=(num_input, num_input), # initializer=tf.random_normal_initializer()), initializer=tf.contrib.layers.xavier_initializer()),
# Train a liblinear SVM on precomputed features: pick C by liblinear's
# built-in cross-validation ('-C'), then optionally fit and evaluate the
# final model and save it.
import sys

import numpy as np

from settings import LIBLINEAR_DIR
from data_reader import load, get_data
from settings import DATA_DIR

# liblinearutil is not installed as a package; it is imported from
# LIBLINEAR_DIR, which therefore has to be on the path first.
sys.path.append(LIBLINEAR_DIR)
from liblinearutil import *

train_set = 'cifar_train_triplet_100_x.npz'
test_set = 'cifar_test_triplet_100_x.npz'

# Perform only model selection (finding best C for linear SVM using CV)
only_model_selection = False

# Save final model. The flag must not be called `save_model`: that name is
# the liblinearutil function used below, and rebinding it to a bool would
# make the save call fail.
save_final_model = False
model_name = 'model_best_triplet'

trainx = get_data(train_set)
_, trainy = load(DATA_DIR, subset='train')
testx = get_data(test_set)
_, testy = load(DATA_DIR, subset='test')

# With '-C' the first element of the result is used as the C value below.
result = train(trainy, trainx, '-C')

if not only_model_selection:
    m = train(trainy, trainx, '-c ' + str(result[0]))
    p_label, p_acc, p_val = predict(testy, testx, m)

    if save_final_model:
        save_model(model_name, m)
# Project precomputed features onto their first two principal components
# and save a scatter plot with up to `samples_per_class` points per class.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from data_reader import load, get_data
from settings import DATA_DIR, DATA_SEED

samples_per_class = 500
data_type = 'train'  # 'test' or 'train'
data_name = 'cifar_train_triplet_100_x.npz'

datax = get_data(data_name)
_, datay = load(DATA_DIR, subset=data_type)

pca = PCA(n_components=2)
X_new = pca.fit_transform(datax)
print(datax.shape)

# Shuffle points and labels with the same seeded permutation so the
# per-class prefix taken below is a reproducible random sample.
rng_data = np.random.RandomState(DATA_SEED)
inds = rng_data.permutation(X_new.shape[0])
X_new, datay = X_new[inds], datay[inds]

plt.rcParams["figure.figsize"] = (15, 12)
for class_id in range(10):
    class_points = X_new[datay == class_id][:samples_per_class]
    first_pc = class_points[:, 0]
    second_pc = class_points[:, 1]
    plt.scatter(first_pc, second_pc)

plt.title('PCA 2D transform on ' + data_name, fontsize=20)
plt.xlabel('PC1', fontsize=18)
plt.ylabel('PC2', fontsize=18)
plt.savefig('pca_' + data_name + '.png')
def main(_):
    """Train a three-layer network on batches produced from the KDE models.

    The network has 12 inputs and 2 outputs and is optimised with Adam on
    the mean squared error; the mean absolute difference is tracked as a
    second metric. After every epoch the summaries are written, both
    losses are printed and the model is saved.

    Args:
        _: unused positional argument (the function never reads it).
    """
    kde_models = get_kde_models(FLAGS.kde_model)
    data = get_data(FLAGS.data)
    names = [
        # "current_slice",
        "blue", "blue_1", "green", "green_1", "red", "red_1",
        "nir", "nir_1", "swir1", "swir1_1", "swir2", "swir2_1"
    ]
    data = np.array(data[names])

    out_count = 2
    input_count = 12

    x = tf.placeholder(tf.float32, [None, input_count], name='input')
    with tf.name_scope('weights'):
        W1 = tf.Variable(
            tf.truncated_normal([input_count, FLAGS.layer1], stddev=0.5),
            name='w1')
        W2 = tf.Variable(
            tf.truncated_normal([FLAGS.layer1, FLAGS.layer2], stddev=0.5),
            name='w2')
        W3 = tf.Variable(
            tf.truncated_normal([FLAGS.layer2, out_count], stddev=0.5),
            name='w3')
    with tf.name_scope('biases'):
        b1 = tf.Variable(tf.zeros([FLAGS.layer1]), name='b1')
        b2 = tf.Variable(tf.zeros([FLAGS.layer2]), name='b2')
        b3 = tf.Variable(tf.zeros([out_count]), name='b3')
    y = model(x, W1, W2, W3, b1, b2, b3)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, out_count], name='target')
    loss1 = tf.reduce_mean(
        tf.losses.absolute_difference(labels=y_, predictions=y),
        name='loss1')
    tf.summary.scalar('abs diff', loss1)

    # The L2 weight penalty is currently disabled, so `loss` is plain MSE.
    loss = tf.reduce_mean(
        tf.losses.mean_squared_error(labels=y_, predictions=y),
        name='loss')
    tf.summary.scalar('Regularized loss', loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    optimizer = optimizer.minimize(loss)

    saver = tf.train.Saver(max_to_keep=1)
    sess = tf.InteractiveSession()
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    tf.global_variables_initializer().run()

    # Train
    for epoch in range(FLAGS.max_epoch):
        # Drawing a data sample is a very expensive operation, so each
        # sample is reused many times; to limit overfitting a large sample
        # is drawn at once.
        batch_xs, batch_ys = get_batch(kde_models, data, 384 * 50,
                                       input_count)
        feed = {x: batch_xs, y_: batch_ys}
        for _step in range(5000):
            sess.run([optimizer], feed_dict=feed)

        # Evaluate on the training batch. The fetch order matches the names
        # it is unpacked into; it used to be [merged, loss1, loss], which
        # printed each value under the other one's label.
        summary, train_loss, train_loss1 = sess.run([merged, loss, loss1],
                                                    feed_dict=feed)
        print('EPOCH', epoch + 1, '\tloss', train_loss,
              '\tloss1', train_loss1)
        train_writer.add_summary(summary, epoch)
        saver.save(sess, os.path.join(FLAGS.model_dir, "model"))

    print_model(FLAGS, 'model')
def main(_):
    """Train a three-layer autoencoder on the input columns of the data.

    The network reconstructs its own input (the input batch is also fed as
    the target). Every 100 epochs the validation split is evaluated, its
    summary is written to the test writer and the model is saved.

    Args:
        _: unused positional argument (the function never reads it).
    """
    data = get_data(FLAGS.data)
    train, val, test = split_data(data)

    out_count = 1
    input_count = train.shape[1] - out_count

    x = tf.placeholder(tf.float32, [None, input_count], name='input')
    with tf.name_scope('weights'):
        W1 = tf.Variable(
            tf.truncated_normal([input_count, FLAGS.layer1], stddev=0.5),
            name='w1')
        W2 = tf.Variable(
            tf.truncated_normal([FLAGS.layer1, FLAGS.layer2], stddev=0.5),
            name='w2')
        W3 = tf.Variable(
            tf.truncated_normal([FLAGS.layer2, input_count], stddev=0.5),
            name='w3')
    with tf.name_scope('biases'):
        b1 = tf.Variable(tf.zeros([FLAGS.layer1]), name='b1')
        b2 = tf.Variable(tf.zeros([FLAGS.layer2]), name='b2')
        b3 = tf.Variable(tf.zeros([input_count]), name='b3')
    logits = model(x, W1, W2, W3, b1, b2, b3)
    y = tf.nn.relu(logits, name='result')

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, input_count], name='target')
    loss1 = tf.reduce_mean(
        tf.losses.absolute_difference(labels=y_, predictions=y),
        name='loss1')
    tf.summary.scalar('abs diff', loss1)

    reg_w = 0.00000000001
    loss = tf.reduce_mean(
        tf.losses.absolute_difference(labels=y_, predictions=y)
        + reg_w * (tf.nn.l2_loss(W1) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)),
        name='loss')
    tf.summary.scalar('Regularized loss', loss)

    saver = tf.train.Saver(max_to_keep=1)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    optimizer = optimizer.minimize(loss)

    sess = tf.InteractiveSession()
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test')
    tf.global_variables_initializer().run()

    val_inputs = val[:, :input_count]
    val_feed = {x: val_inputs, y_: val_inputs}
    test_inputs = test[:, :input_count]

    # Train
    for epoch in range(FLAGS.max_epoch):
        # Autoencoder: the batch targets are not used; the inputs are fed
        # as their own reconstruction target.
        batch_xs, _ = get_batch(train, 2 ** 12, input_count)
        _, summary, train_loss, train_loss1 = sess.run(
            [optimizer, merged, loss, loss1],
            feed_dict={x: batch_xs, y_: batch_xs})
        train_writer.add_summary(summary, epoch)

        # Test trained model
        if epoch % 100 == 99:
            # One pass over the validation split yields both the summary
            # and the reported loss. Previously the *training* summary was
            # written to the test writer, so both curves were identical.
            val_summary, val_loss1 = sess.run([merged, loss1],
                                              feed_dict=val_feed)
            print('EPOCH', epoch + 1, 'Loss: \tval', val_loss1,
                  '\ttrain', train_loss1)
            test_writer.add_summary(val_summary, epoch)
            saver.save(sess, os.path.join(FLAGS.model_dir, "model"))

    print_model(FLAGS, 'model', test_inputs, test_inputs)
# Bagging-style experiment: K times, draw a bootstrap resample of the
# training features, add Gaussian noise, train a liblinear model on it and
# predict the test set.
import sys
import numpy as np
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from data_reader import load, get_data
from settings import DATA_DIR, LIBLINEAR_DIR

sys.path.append(LIBLINEAR_DIR)
from liblinearutil import *

data_train = get_data('cifar_train_triplet_100_x.npz')
data_test = get_data('cifar_test_triplet_100_x.npz')

# For more general
K = 10
cs = 0.250000

_, trainy = load(DATA_DIR, subset='train')
_, testy = load(DATA_DIR, subset='test')

joined = []
for k in range(K):
    # Sample as many row indices as there are rows, with replacement.
    n_rows = data_train.shape[0]
    ind1 = np.random.choice(n_rows, n_rows)

    resampled = data_train[ind1]
    jitter = np.random.normal(0, 0.3, resampled.shape)
    trainx_temp = resampled + jitter
    trainy_temp = trainy[ind1]

    m = train(trainy_temp, trainx_temp, '-c ' + str(cs))
    p_label, p_acc, p_val = predict(testy, data_test, m)
# Load one grayscale image per entry of d["smiles"] and build the integer
# training targets from d["target1"].
from data_reader import get_data
import cv2
import keras
import numpy as np
from keras.layers import Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D, Dropout
from keras.models import Sequential
from keras import Input, Model
import matplotlib.pylab as plt

d = get_data()

# Images are stored as images/<position>.png, where <position> is the
# entry's position in d["smiles"]; the entry itself is not used.
images = []
for index, _smile in enumerate(d["smiles"]):
    img = cv2.imread("images/" + str(index) + ".png")
    images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
# Keep the running counter at "number of images read", as before.
index = len(images)

# Empty targets become 0; only the first 1000 targets are kept.
target1_for_training = [
    int(t) if t != "" else 0
    for t in d["target1"]
][0:1000]

img_x = 300
img_y = 300
'cifar_train_triplet_1024_x.npz', 'cifar_train_triplet_100_x.npz', 'cifar_train_triplet_2048_x.npz', 'cifar_train_triplet_2048_L2_x.npz', 'cifar_train_x.npz' ] data_tests = [ 'cifar_test_triplet_1024_x.npz', 'cifar_test_triplet_100_x.npz', 'cifar_test_triplet_2048_x.npz', 'cifar_test_triplet_2048_L2_x.npz', 'cifar_test_x.npz' ] cs = [0.015625, 0.031250, 0.031250, 0.250000, 0.031250] _, trainy = load(DATA_DIR, subset='train') _, testy = load(DATA_DIR, subset='test') joined = [] for k in range(len(cs)): trainx = get_data(data_trains[k]) testx = get_data(data_tests[k]) trainx = trainx + np.random.normal(0, 0.3, trainx.shape) m = train(trainy, trainx, '-c ' + str(cs[k])) p_label, p_acc, p_val = predict(testy, testx, m) joined.append(np.expand_dims(p_label, axis=0)) joined = np.transpose(np.concatenate(joined, axis=0), (1, 0)) m_voting = [] for k in range(joined.shape[0]): m_voting.append(mode(joined[k])[0][0]) acc = accuracy_score(m_voting, testy) print(acc)
# Layer widths of the three dense layers (earlier alternatives kept for
# reference).
#setting = [4048, 4048, 1024]
#setting = [2048, 1048, 100]
setting = [4048, 4048, 2048]

# Loss selector:
#   ''   if we use loss from https://arxiv.org/abs/1704.02227
#   'L2' if we use loss max(d_+ - d_- + lambda, 0), where lambda=10.0
l_type = 'L2'

# Input layer followed by three (Gaussian noise -> dense) pairs; the first
# noise layer uses sigma 0.3, the other two 0.5.
layers = [LL.InputLayer(shape=(None, 2048))]
for noise_sigma, width in zip((0.3, 0.5, 0.5), setting):
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=noise_sigma))
    layers.append(nn.DenseLayer(layers[-1], num_units=width))

trainx = get_data('cifar_train_x.npz')
_, trainy = load(DATA_DIR, subset='train')
print(trainx.shape)

x_lab = T.matrix()
output_lab = LL.get_output(layers[-1], x_lab, deterministic=False)


def get_triplets(prediction, size):
    """Split ``prediction`` into three consecutive chunks of ``size`` rows.

    Returns:
        ``(a, b, c)``: rows ``[0, size)`` (query case), ``[size, 2*size)``
        (positive case) and ``[2*size, 3*size)`` (negative case).
    """
    bounds = (0, size, 2 * size, 3 * size)
    a, b, c = (prediction[lo:hi] for lo, hi in zip(bounds, bounds[1:]))
    return a, b, c
def _subsample(rows, sample_size):
    """Return ``sample_size`` random rows (with replacement), or all rows
    when there are no more than ``sample_size`` of them."""
    if rows.shape[0] > sample_size:
        idx = np.random.randint(rows.shape[0], size=sample_size)
        return rows[idx, :]
    return rows


def main(flags):
    """Fit one kernel density model per class and pickle both.

    Rows with ``change == 1`` ("plus") and ``change == 0`` ("minus") are
    sub-sampled to at most ``flags.sample_size`` rows each. A
    cross-validated grid search picks the bandwidth per class. Each model
    is then scored on all rows and the share of rows whose own class model
    gives the higher score is printed.

    Args:
        flags: object providing ``sample_size``, ``model_name`` (output
            pickle path) and ``data`` (forwarded to ``get_data``).
    """
    sample_size = flags.sample_size
    model_filename = flags.model_name

    data = get_data(flags.data)
    names = [
        # "current_slice",
        "blue", "blue_1", "green", "green_1", "red", "red_1",
        "nir", "nir_1", "swir1", "swir1_1", "swir2", "swir2_1"
    ]
    plus = np.array(data.loc[data['change'] == 1][names])
    minus = np.array(data.loc[data['change'] == 0][names])

    # Order kept from the original (minus first) so a seeded run draws the
    # same samples.
    minus_sample = _subsample(minus, sample_size)
    plus_sample = _subsample(plus, sample_size)

    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(0.005, 0.025, 11)},
                        n_jobs=-1,
                        cv=10)  # 10-fold cross-validation

    grid.fit(plus_sample)
    print('Best bandwidth (plus):', grid.best_params_)
    kde_plus = grid.best_estimator_

    grid.fit(minus_sample)
    print('Best bandwidth (minus):', grid.best_params_)
    kde_minus = grid.best_estimator_

    density_plus_p = kde_plus.score_samples(plus)
    density_plus_m = kde_plus.score_samples(minus)
    density_minus_p = kde_minus.score_samples(plus)
    density_minus_m = kde_minus.score_samples(minus)

    # np.count_nonzero replaces `sum(mask.astype(np.int))`: the `np.int`
    # alias was removed from NumPy and raises AttributeError there.
    true_plus = density_plus_p > density_minus_p
    true_plus_prop = np.count_nonzero(true_plus) / len(true_plus)
    true_minus = density_minus_m > density_plus_m
    true_minus_prop = np.count_nonzero(true_minus) / len(true_minus)
    print('True plus:', true_plus_prop)
    print('True minus:', true_minus_prop)

    models = {'plus_model': kde_plus, 'minus_model': kde_minus}
    # Context manager so the file is flushed and closed deterministically.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(models, model_file)
area_object = { 'id': int(elf_id[1:]), 'x': int(top), 'y': int(left[:-1]), 'width': int(width), 'height': int(height) } refactored.append(area_object) return refactored def create_matrix(obj): fab = np.zeros((1000, 1000), dtype=np.int) for o in obj: area = fab[o['y']:o['y'] + o['height'], o['x']:o['x'] + o['width']] area[:] = area + 1 for o in obj: area = fab[o['y']:o['y'] + o['height'], o['x']:o['x'] + o['width']] if np.sum(np.where(area == 1, 0, 2)) < 1: print(o['id']) return fab if __name__ == '__main__': initial_data = get_data(False) data = refactor_data(initial_data) fabric = create_matrix(data)
def _train(args, pretrain_args):
    """Train the language model.

    For each of ``args.monte_carlo_cv_num`` runs: load the data with that
    run's seed, build the Train/Valid/Test models on shared variables,
    train and validate every speaker each epoch (saving whenever
    ``_update_ppls`` reports an improvement), test every speaker and,
    when ``args.insert_db == 'True'``, write the results to the database.

    Args:
        args: run arguments (speakers, paths, embedding settings, ...).
        pretrain_args: pre-training arguments; when truthy a Saver is built
            through ``utils.create_tf_saver`` instead of a plain
            ``tf.train.Saver``.
    """
    start_time = time.time()
    print('Training', ', '.join(args.speakers), '...')

    # One full train/validate/test cycle per Monte Carlo CV run.
    for num in range(args.monte_carlo_cv_num):
        # Seed used to sub-sample the validation set.
        seed = utils.get_seed(num)

        train_data, valid_data, test_data, id_to_word = data_reader.get_data(
            args, seed=seed)
        config, test_config = utils.set_config(args, id_to_word)
        init_embed = utils.init_embedding(id_to_word,
                                          dim=args.embed_size,
                                          init_scale=args.init_scale,
                                          embed_path=args.embed_path)

        with tf.Graph().as_default():
            # Initializer used for the TensorFlow variables.
            scale = config['init_scale']
            initializer = tf.random_uniform_initializer(-scale, scale)

            # Train model creates the variables ...
            with tf.name_scope('Train'), tf.variable_scope(
                    'Model', reuse=None, initializer=initializer):
                m_train = model.Model(args, is_training=True, config=config,
                                      init_embed=init_embed, name='Train')
                m_train.build_graph()

            # ... which the Valid and Test models reuse.
            with tf.name_scope('Valid'), tf.variable_scope(
                    'Model', reuse=True, initializer=initializer):
                m_valid = model.Model(args, is_training=False, config=config,
                                      init_embed=init_embed, name='Valid')
                m_valid.build_graph()

            with tf.name_scope('Test'), tf.variable_scope(
                    'Model', reuse=True, initializer=initializer):
                m_test = model.Model(args, is_training=False,
                                     config=test_config,
                                     init_embed=init_embed, name='Test')
                m_test.build_graph()

            # Summaries to be viewed in TensorBoard.
            tb_summaries = utils.TensorBoardSummaries()
            tb_summaries.create_ops()

            init = tf.global_variables_initializer()

            if pretrain_args:
                # A pretrained model needs an explicit name -> variable
                # mapping for the Saver.
                trainable = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES)
                reuse_vars_dict = {var.op.name: var for var in trainable}
                saver = utils.create_tf_saver(args, pretrain_args,
                                              reuse_vars_dict)
            else:
                saver = tf.train.Saver()

            # Perplexities that end up in the results database.
            ppls = {}
            ppls, _ = _update_ppls(ppls, initialize=True)

            with tf.Session() as sess:
                sess.run(init)

                if args.load_path != '':
                    print('Restoring model...')
                    saver.restore(sess, args.load_path)

                for epoch in range(config['max_epoch']):
                    epoch_number = epoch + 1
                    print('Epoch: {0} Learning rate: {1:.3f}\n'.format(
                        epoch_number, sess.run(m_train.lr)))

                    for i, speaker in enumerate(args.speakers):
                        print('Training {0} ...'.format(speaker))
                        train_perplexity = _run_epoch(
                            sess, m_train, args, train_data, i,
                            tb_summaries, id_to_word,
                            train_op=m_train.train_op, verbose=True)
                        print('Epoch: {0} Train Perplexity: {1:.3f}'.format(
                            epoch_number, train_perplexity))
                        ppls, _ = _update_ppls(ppls, epoch=epoch_number,
                                               speaker=speaker,
                                               ppl=train_perplexity,
                                               dataset='train')

                        print('Validating...')
                        valid_perplexity = _run_epoch(
                            sess, m_valid, args, valid_data, i,
                            tb_summaries, id_to_word, verbose=True)
                        print('Epoch: {0} Valid Perplexity: {1:.3f}'.format(
                            epoch_number, valid_perplexity))
                        ppls, improved = _update_ppls(ppls,
                                                      epoch=epoch_number,
                                                      speaker=speaker,
                                                      ppl=valid_perplexity,
                                                      dataset='valid')

                        # Save only when the validation perplexity improved
                        # and a save path was given.
                        if improved and args.save_path != '':
                            print('Saving model to {0}.'.format(
                                args.save_path))
                            saver.save(sess, args.save_path)

                for i, speaker in enumerate(args.speakers):
                    print('Testing {0} ...'.format(speaker))
                    print('Restoring best model for testing...')
                    saver.restore(sess, args.save_path)
                    test_perplexity = _run_epoch(sess, m_test, args,
                                                 test_data, i)
                    ppls['test_ppl_' + speaker] = test_perplexity
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))

            if args.insert_db == 'True':
                # Write params/config/results to the SQL database.
                results_db.insert_results(args, config, start_time, ppls)