def main(args):
    data_train, gold_train = get_data(args.train_path)
    data_dev, gold_dev = get_data(args.dev_path)
    word_to_int, int_to_word = get_vocab(data_train, args.min_word_count)
    vocab_size = len(word_to_int)
    max_len = 100
    train_loader = DataLoader(data_train, gold_train, args.batch_size,
                              word_to_int, args.transformer)
    dev_loader = TestLoader(data_dev, gold_dev, word_to_int, args.transformer)
    lossFunction = nn.CrossEntropyLoss()
    if args.transformer:
        model = Model_T(args.embed_size, args.hidden_size, args.inter_size,
                        vocab_size, max_len, args.n_heads, args.n_layers,
                        args.per_layer, args.dropout_prob_classifier,
                        args.dropout_prob_attn, args.dropout_prob_hidden,
                        args.use_elmo, args.num_rep, args.elmo_drop).cuda()
    elif args.BiLSTM:
        model = Model_B(args.embed_size, args.hidden_size, vocab_size,
                        args.use_elmo, args.num_rep, args.elmo_drop).cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    train(model, optimizer, lossFunction, train_loader, dev_loader,
          args.epochs, args.eval_every)
def __init__(self, networkType, numIterations, populationSize, selection,
             crossover, crossoverProb, mutationProb, nnParams):
    self.networkType = networkType
    self.numberOfIterations = numIterations
    self.populationSize = populationSize
    self.selection = selection
    self.crossover = crossover
    self.crossoverProb = crossoverProb
    self.mutationProb = mutationProb
    self.nnParams = nnParams
    # User specified a fully connected neural network
    if self.networkType == "nn":
        sys.stdout.flush()
        # Load data from the cifar10 dataset with 10 output classes
        self.train_train_data = dl.get_data("nn", 'cifar10', 10)
    # User specified a convolutional neural network
    else:
        sys.stdout.flush()
        # Load data from cifar10
        self.x_y_train_data = dl.get_data("cnn", 'cifar10')
    self.population = self.initializePopulation()
def print_accuracy(config, model):
    classifier = KNeighborsClassifier(n_neighbors=1)
    X, y = get_data(config)
    X, y = to_var(X), to_var(y)
    if model is not None:
        X = model.g(X)
    X, y = to_data(X), to_data(y)
    classifier.fit(X.reshape(X.shape[0], -1), y.reshape(-1))
    Xhat, yhat = get_data(config, train=False)
    Xhat, yhat = to_var(Xhat), to_var(yhat)
    Xhat, yhat = to_data(Xhat), to_data(yhat)
    pred = classifier.predict(Xhat.reshape(Xhat.shape[0], -1))
    accuracy = (pred == yhat.reshape(-1)).astype(float).mean()
    print('Classification accuracy: %0.4f' % accuracy)
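# The snippet above depends on to_var / to_data helpers that are not shown.
# A minimal sketch, assuming they move numpy arrays onto the GPU as torch
# tensors and back; the project's actual helpers may differ.
import numpy as np
import torch

def to_var(x):
    # Wrap a numpy array as a float tensor, on the GPU when available.
    t = torch.from_numpy(np.asarray(x)).float()
    return t.cuda() if torch.cuda.is_available() else t

def to_data(x):
    # Bring a tensor back to the CPU as a numpy array.
    if torch.is_tensor(x):
        return x.detach().cpu().numpy()
    return np.asarray(x)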
def main(args):
    SEED = 17
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    random.seed(SEED)
    torch.cuda.manual_seed(SEED)
    net = nm.Net()
    path = './trained_model_best.pth'
    d = torch.load(path)
    net.load_state_dict(d['state_dict'])
    net.eval()
    net.cuda()
    train_loader, test_loader = dl.get_data(args.data)
    val_loss = 0.
    criterion = torch.nn.MSELoss()
    for batch_idx, batch in enumerate(test_loader):
        inputs = torch.autograd.Variable(batch).cuda()
        with torch.no_grad():
            output = net(inputs)
            loss = criterion(output, inputs)
        val_loss += loss.cpu().data.numpy() * len(inputs)
    val_loss = val_loss / len(test_loader.dataset)
    print(val_loss)
def solve_sequential():
    X, y, n, dim = data_loader.get_data()
    X_trans = X.transpose()
    iteration_cnt = 10000
    opt_loss = optimal_solve(X, y, n)
    bt_beta = 0.5
    bt_alpha = 0.1
    current_beta = np.zeros(dim + 1)
    for i in range(iteration_cnt):
        gradient = X.dot(current_beta) - y
        gradient = X_trans.dot(gradient)
        gradient_norm = np.linalg.norm(gradient, ord=2)
        gradient_norm_sq = gradient_norm ** 2
        step_size = 1
        # backtracking line search
        while target_func(X, y, current_beta - step_size * gradient) > \
                target_func(X, y, current_beta) - bt_alpha * step_size * gradient_norm_sq:
            step_size *= bt_beta
        current_beta = current_beta - step_size * gradient
        diff = target_func(X, y, current_beta) - opt_loss
        diff /= n
        print(math.log(diff))
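# target_func and optimal_solve are defined elsewhere. Since the gradient
# above is X^T (X beta - y), a consistent sketch of target_func is the
# least-squares objective f(beta) = 0.5 * ||X beta - y||^2; the project's
# actual helper may scale it differently.
import numpy as np

def target_func(X, y, beta):
    # Least-squares loss consumed by the backtracking line search above.
    residual = X.dot(beta) - y
    return 0.5 * residual.dot(residual)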
def train_network():
    print('Loading data')
    (data, labels) = data_loader.get_data()
    print('Dividing data')
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=20)
    print(x_train[0].shape)
    # This part could be changed
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(256, 256)),
        tf.keras.layers.Dense(120, activation=tf.nn.relu),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=128, epochs=5)
    a = model.evaluate(x_test, y_test)
    print(a)
    save_network(model)
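# save_network is not shown in this snippet. A minimal sketch, assuming it
# simply persists the Keras model; the filename is a placeholder.
def save_network(model, path='network.h5'):
    # Keras serializes architecture, weights, and optimizer state in one call.
    model.save(path)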
def solve_sequential():
    X, y, n, dim = data_loader.get_data()
    lasso_lam = 1
    opt_val = float('inf')
    X_trans = X.transpose()
    iteration_cnt = 10000
    current_beta = np.zeros(dim + 1)
    for i in range(iteration_cnt):
        rate = 1
        shrink_beta = 0.5
        while True:
            op = X_trans.dot(y - X.dot(current_beta))
            gradient = -op
            op = rate * op
            op = current_beta + op
            try_beta = soft_thres_op(lasso_lam * rate, op)
            gtbeta = (current_beta - try_beta) / rate
            if func_g(X, y, try_beta) > func_g(X, y, current_beta) - \
                    rate * gtbeta.dot(gradient) + \
                    (rate / 2) * (np.linalg.norm(gtbeta, ord=2)) ** 2:
                rate *= shrink_beta
            else:
                current_beta = try_beta
                break
        opt_val = min(opt_val, target_func(X, y, lasso_lam, current_beta))
        print(opt_val)
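# soft_thres_op is the proximal operator of the l1 penalty used in the
# proximal-gradient step above. A minimal sketch, assuming the first argument
# is the threshold and the second the vector, as at the call site above.
import numpy as np

def soft_thres_op(thres, v):
    # Elementwise soft thresholding: shrink each coordinate toward zero.
    return np.sign(v) * np.maximum(np.abs(v) - thres, 0.0)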
def train(dataset: str, learning_rate: float, epochs: int, device: str,
          model: str, beta: float):
    def kl_divergence(means, log_sigma, target_sigma=1.):
        kl_div = target_sigma**-2 * means**2 + 2 * np.log(target_sigma)
        kl_div += -2 * log_sigma + torch.exp(2 * log_sigma) / target_sigma**2
        return torch.mean(torch.sum(kl_div, dim=1))

    device = torch.device(device if torch.cuda.is_available() else 'cpu')
    train_loader, valid_loader = get_data(dataset)
    net_setup = {
        'oscillator': {
            'input_size': 50,
            'input2_size': 1,
            'latent_size': 2,
            'output_size': 1,
            'encoder_units': [100, 100],
            'decoder_units': [100, 100]
        },
        'collision': {
            'input_size': 30,
            'input2_size': 16,
            'latent_size': 1,
            'output_size': 2,
            'encoder_units': [150, 100],
            'decoder_units': [100, 150]
        }
    }[dataset]
    network = SciNet(**net_setup)
    network = network.to(device)
    if model:
        network.load_state_dict(torch.load(model))
        print("Restored weights")
    mse_loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
    for epoch in tqdm(range(epochs)):
        for i, (x, t, y) in enumerate(train_loader):
            x, t, y = x.to(device), t.to(device), y.to(device)
            optimizer.zero_grad()
            mu, log_sigma, output = network(x, t)
            loss = mse_loss(output, y) + beta * kl_divergence(mu, log_sigma)
            loss.backward()
            torch.nn.utils.clip_grad.clip_grad_value_(network.parameters(), 10.0)
            optimizer.step()
        print(loss.item())
    torch.save(network.state_dict(), f'{dataset}.pth')
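# Note on kl_divergence above: the analytic KL between N(mu, sigma^2) and the
# target N(0, tau^2) is
#     KL = log(tau / sigma) + (sigma^2 + mu^2) / (2 * tau^2) - 1/2,
# and the per-dimension expression summed in the code equals 2*KL + 1. The
# extra affine factors only rescale beta and shift the loss by a constant,
# so the gradients point the same way.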
def run(args):
    logger.info('Read data:')
    train_A, train_B, test_A, test_B = get_data(args.task, args.image_size)
    logger.info('Build graph:')
    model = CycleGAN(args)
    variables_to_save = tf.global_variables()
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)
    logger.info('Trainable vars:')
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    for v in var_list:
        logger.info(' %s %s', v.name, v.get_shape())
    if args.load_model != '':
        model_name = args.load_model
    else:
        model_name = '{}_{}'.format(
            args.task, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    logdir = '/content/drive/My Drive/Colab Notebooks/Cycle/logs'
    makedirs(logdir)
    logdir = os.path.join(logdir, model_name)
    logger.info('Events directory: %s', logdir)
    summary_writer = tf.summary.FileWriter(logdir)

    def init_fn(sess):
        logger.info('Initializing all parameters.')
        sess.run(init_all_op)

    sv = tf.train.Supervisor(
        is_chief=True,
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=model.global_step,
        save_model_secs=300,
        save_summaries_secs=30)
    if args.train:
        logger.info("Starting training session.")
        with sv.managed_session() as sess:
            model.train(sess, summary_writer, train_A, train_B)
    logger.info("Starting testing session.")
    with sv.managed_session() as sess:
        base_dir = os.path.join(
            '/content/drive/My Drive/Colab Notebooks/Cycle/results', model_name)
        makedirs(base_dir)
        model.test(sess, test_A, test_B, base_dir)
def get_data_linear():
    data, labels, columns_dict, values_dict = get_data()
    # preprocess the diseases
    disease_dict = {}
    key_to_disease_string = {}
    for k in values_dict[DISEASE_LABEL]:
        diseases = map(str.lower, map(str.strip, k.split(';')))
        key_to_disease_string[values_dict[DISEASE_LABEL][k]] = k
        for d in diseases:
            if d not in disease_dict:
                disease_dict[d] = len(disease_dict)
    print("DISEASES:", len(disease_dict))
    # end disease preprocess
    # DANGEROUS: RAN ONCE TO FIGURE OUT NUM OF ROWS.
    NUM_ROWS = 5528
    global NUM_COLS
    global NUM_COLS_INITIALIZED
    if not NUM_COLS_INITIALIZED:
        for k in columns_dict:
            if k in FLOAT_LABELS:
                NUM_COLS += 1
            elif k not in IGNORE_LABELS:
                NUM_COLS += len(values_dict[k].items())
        NUM_COLS_INITIALIZED = True
        if INCLUDE_DISEASES:
            NUM_COLS += len(disease_dict)
    # do not change the above unless the amount of data changes.
    index_labels = {}
    for k in columns_dict:
        index_labels[columns_dict[k]] = k
    linearized_data = np.zeros((NUM_ROWS, NUM_COLS))
    print("SHAPE OF DATA:", linearized_data.shape)
    for i, d in enumerate(data):
        write_index = 0
        for j, val in enumerate(d):
            if index_labels[j] in FLOAT_LABELS:
                linearized_data[i, write_index] = val
                write_index += 1
            elif INCLUDE_DISEASES and index_labels[j] == DISEASE_LABEL:
                diseases = map(
                    str.lower,
                    map(str.strip, key_to_disease_string[val].split(';')))
                for dis in diseases:
                    linearized_data[i, write_index + disease_dict[dis]] = 1
                write_index += len(disease_dict)
            elif index_labels[j] in IGNORE_LABELS:
                continue
            else:
                assert val == int(val), 'Value must be a value index'
                linearized_data[i, write_index + int(val)] = 1
                write_index += len(values_dict[index_labels[j]].items())
        assert write_index == NUM_COLS
    return linearized_data, labels
def train(args): print("Getting Dataset...") source, target = data_loader.get_data( args.data_dir) # TODO make target data print("Initializing the Model...") model = EncDecModel(args) print("Starting training...") with tf.Session() as sess: tf.initialize_all_variables().run() saver = tf.train.Saver(tf.all_variables()) for i in range(args.epoches): batch_x = source[i + 0:i + args.batch_size * args.time_steps * args.input_size] charlist = source[i + 1:i + args.batch_size + 1] batch_y = np.zeros((len(charlist), args.output_classes)) for j in range(len(charlist) - 1): batch_y[j][int(charlist[j])] = 1.0 # Reshape batch to input size batch_x = batch_x.reshape( (args.batch_size, args.time_steps, args.input_size)) feed = {model.X: batch_x, model.Y: batch_y} sess.run(model.optimizer, feed_dict=feed) if i % args.display_step == 0: # Calculate Accuracy summary, acc = sess.run([model.acc_summary, model.accuracy], feed_dict=feed) print "Step: " + str(i) + ", Training Accuracy: " + str(acc) if i % 100 == 0 and not (i == 0): seq = '' x_inp = batch_x for j in range(140): index = model.hypothesis_index.eval({ model.X: x_inp, model.Y: batch_y }) next_letter = unichr(index[0]) x_inp = source[i + 0 + 1 + j:i + args.batch_size * args.time_steps * args.input_size + 1 + j] x_inp[-1] = float(ord(next_letter)) x_inp = x_inp.reshape( (args.batch_size, args.time_steps, args.input_size)) seq += next_letter with open('save/gen' + str(i) + '.txt', 'w+') as f: print "save:\n" + seq f.write(seq) f.close() if i % 1000 == 0 and not (i == 0): saver.save(sess, "save/" + str(i) + ".ckpt") print "Training is COMPLETE!" x_test = source[i:i + args.batch_size] y_test = source[i * args.batch_size + 1:i * 2 * args.batch_size] test_accuracy = sess.run(model.accuracy, feed_dict=feed) print("Final test accuracy: %g" % (test_accuracy)) saver.save(sess, "save/model.ckpt")
def get_model(model, ckpt_name, option):
    model_path = ospj('train_log', option.log_dir, ckpt_name)
    ds = get_data('val', option)
    pred_config = PredictConfig(
        model=model,
        session_init=get_model_loader(model_path),
        input_names=['input', 'label', 'bbox'],
        output_names=['wrong-top1', 'top5', 'actmap', 'grad'],
        return_input=True)
    return SimpleDatasetPredictor(pred_config, ds)
def main(args):
    SEED = 17
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    random.seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    # train_loader, test_loader = dl.get_data(args.data)
    train_loader, test_loader = dl.get_data("")
    trainer.train_network(train_loader, test_loader)
def train(args): print("Getting Dataset...") source, target = data_loader.get_data(args.data_dir) # TODO make target data print("Initializing the Model...") model = EncDecModel(args) print("Starting training...") with tf.Session() as sess: tf.initialize_all_variables().run() saver = tf.train.Saver(tf.all_variables()) for i in range(args.epoches): batch_x = source[i+0 : i+args.batch_size * args.time_steps * args.input_size] charlist = source[i+1 : i+args.batch_size+1] batch_y = np.zeros((len(charlist), args.output_classes)) for j in range(len(charlist)-1): batch_y[j][int(charlist[j])] = 1.0 # Reshape batch to input size batch_x = batch_x.reshape((args.batch_size, args.time_steps, args.input_size)) feed = {model.X: batch_x, model.Y: batch_y} sess.run(model.optimizer, feed_dict=feed) if i % args.display_step == 0: # Calculate Accuracy summary, acc = sess.run([model.acc_summary, model.accuracy], feed_dict=feed) print "Step: " + str(i) + ", Training Accuracy: " + str(acc) if i % 100 == 0 and not(i==0): seq = '' x_inp = batch_x for j in range(140): index = model.hypothesis_index.eval({ model.X: x_inp, model.Y: batch_y }) next_letter = unichr(index[0]) x_inp = source[i+0+1+j:i+args.batch_size*args.time_steps*args.input_size+1+j] x_inp[-1] = float(ord(next_letter)) x_inp = x_inp.reshape((args.batch_size,args.time_steps,args.input_size)) seq += next_letter with open('save/gen' + str(i) + '.txt', 'w+') as f: print "save:\n" +seq f.write(seq) f.close() if i % 1000 == 0 and not(i == 0): saver.save(sess,"save/" +str(i) +".ckpt") print "Training is COMPLETE!" x_test = source[i:i+args.batch_size] y_test = source[i*args.batch_size+1:i*2*args.batch_size] test_accuracy = sess.run(model.accuracy, feed_dict=feed) print ("Final test accuracy: %g" %(test_accuracy)) saver.save(sess,"save/model.ckpt")
def run_model():
    data, true_labels = ldl.get_data_linear()
    true_buckets = [util.bucket(t) for t in true_labels]
    data = np.tile(data, (DATA_MULTIPLIER, 1))
    print("DATA SHAPE:", data.shape)
    true_buckets = np.tile(true_buckets, DATA_MULTIPLIER)
    # tuples of (batch_id, total regret, error while training, eval error,
    # precision, recall)
    batch_results = []
    for T in range(NUM_BATCHES):
        model = Lin_UCB(ALPHA)
        # model = LASSO_BANDIT()
        if False:
            data, true_labels, columns_dict, values_dict = dl.get_data()
            true_buckets = [util.bucket(t) for t in true_labels]
            # model = Fixed_Dose(columns_dict, values_dict)
            # model = Warfarin_Clinical_Dose(columns_dict, values_dict)
            # model = Warfarin_Pharmacogenetic_Dose(columns_dict, values_dict)
        batch_id = str(random.randint(100000, 999999))
        print()
        print("Start Batch: ", batch_id)
        zipped_data = list(zip(data, true_buckets))
        random.shuffle(zipped_data)
        data, true_buckets = zip(*zipped_data)
        data = np.array(data)
        model.train(data, true_buckets)
        pred_buckets = model.evaluate(data)
        print(batch_id, "Performance on " + str(model))
        acc, precision, recall = util.evaluate_performance(pred_buckets, true_buckets)
        print("\tAccuracy:", acc)
        print("\tPrecision:", precision)
        print("\tRecall:", recall)
        plot_regret(model.regret, ALPHA, batch_id)
        plot_error_rate(model.error_rate, ALPHA, batch_id)
        batch_results.append(
            (batch_id, model.get_regret()[-1], model.get_error_rate()[-1],
             1 - acc, precision, recall))
        with open('batch/regret' + str(model) + batch_id, 'wb') as fp:
            pickle.dump(model.regret, fp)
        with open('batch/error' + str(model) + batch_id, 'wb') as fp:
            pickle.dump(model.error_rate, fp)
    return batch_results
def train_model():
    (data, label) = data_loader.get_data()
    data = data.reshape((len(data), 256 * 256))
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, test_size=0.2, random_state=20)
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(x_train, y_train)
    save_model(clf)
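# save_model is not shown in this snippet. A minimal sketch, assuming a
# joblib-based serializer; the filename is a placeholder.
import joblib

def save_model(clf, path='logistic_regression.model'):
    # Persist the fitted scikit-learn estimator to disk.
    joblib.dump(clf, path)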
def train_model(model_name='random_forest', verbose=False):
    X_train, X_test, y_train, y_test = get_data(num_days_per_month=7)
    # train model
    model = build_train_model(model_name, X_train, y_train, vb=verbose)
    # evaluate model
    evaluate(model, X_test, y_test, model_name=model_name, plot_probs=True)
    # save model
    os.chdir("../../../../../../home/sdas11/")
    joblib.dump(model, f'{model_name}.model')
    feat_imp(model, model_name=model_name)
def setUp(self):
    """This method sets the variables for the following tests."""
    self._m = 100
    self._n = 30
    self._k = 5
    self._increment = 20
    self._A = get_data(ExperimentType.ExampleNo2)(
        self._m, np.arange(2 * self._k).astype(float))
    self._approximation = random_svd(self._A, self._k, self._increment)
    self._U = self._approximation.U
    self._sigma = np.array(self._approximation.sigma)
    self._VT = self._approximation.V.T
    self._approximation = self._approximation.as_numpy_arr()
    self._A = self._A.as_numpy_arr()
def setUp(self):
    """This method sets the variables for the following tests."""
    self._m = 100
    self._n = 30
    self._k = 5
    self._increment = 20
    self._A = get_data(ExperimentType.ExampleNo2)(
        self._m, np.arange(2 * self._k).astype(float))
    self._approximation = random_id(self._A, self._k, self._increment)
    self._B = self._approximation.B
    self._P = np.array(self._approximation.P)
    self._A = self._A.as_numpy_arr()
    self._n = self._A.shape[1]
    self._approximation = self._approximation.as_numpy_arr()
def config():
    """
    Config section

    This function contains all possible configurations for the experiments.
    Full details on each configuration value can be found in :mod:`enums.py`.
    """
    experiment_type: str = ExperimentType.ExampleNo5
    singular_values: RowVector = choose_singular_values(experiment_type)
    used_data_factory: Callable = get_data(experiment_type)
    data_sizes: List = choose_data_sizes(experiment_type)
    approximation_ranks: List = choose_approximation_ranks(experiment_type)
    increments: List = choose_increments(experiment_type)
    results_path: str = r'Results/'
    power_method_iterations: int = 100
def train_all(verbose=False):
    X_train, X_test, y_train, y_test = get_data(num_days_per_month=7)
    for model_name in ['NN', 'random_forest', 'xgboost_clf',
                       'logistic_regression', 'svm', 'naive_bayes']:
        print(model_name.upper(), ':')
        # train model
        model = build_train_model(model_name, X_train, y_train, vb=verbose)
        # evaluate model
        evaluate(model, X_test, y_test, model_name=model_name, plot_probs=False)
        # save model
        os.chdir("../../../../../../home/sdas11/")
        joblib.dump(model, f'{model_name}.model')
        print('\n')
        feat_imp(model, model_name=model_name, show_plot=False)
        print('\n')
def __init__(self):
    _, self.data_dict, all_features, all_objects = data_loader.get_data()
    self.unexamined_features = all_features[:]
    for f in feature_blacklist:
        self.unexamined_features.remove(f)
    self.known_features = {}
    self.probabilities = {}
    self.probability_features_all_objects = {}
    self.questions_asked = 0
    self.unexamined_objects = all_objects[:]
    self.ordered_probs = []
    for o in self.unexamined_objects:
        self.probability_features_all_objects[o] = 1.0
        self.probabilities[o] = self.prior_probability(o)
    self.cur_entropy = self.entropy(self.probabilities)
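# self.entropy is not shown here. A minimal sketch, assuming it computes the
# Shannon entropy of a dict of (possibly unnormalized) object probabilities;
# the class's real method may normalize differently.
import math

def entropy(probabilities):
    # H(p) = -sum_i p_i * log(p_i) over the normalized distribution.
    total = sum(probabilities.values())
    h = 0.0
    for p in probabilities.values():
        p = p / total
        if p > 0:
            h -= p * math.log(p)
    return h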
def run(target, start=0, iteration=10, batch_size=256, epochs=500,
        out='./model', name=None):
    x_train, y_train, x_test = get_data(target=target, name=name)
    print(f'Kind of Data: {len(x_train)}')
    x_pred_list, y_pred_list, m_pred_list, v_pred_list = [], [], [], []
    for i in range(iteration):
        model = set_model(target, x_train, out=out, name=name)
        hist = train(model, x_train, y_train, epochs=epochs,
                     batch_size=batch_size, label=LABEL[target],
                     seq=start + i, out=out)
        best_model = load_best_model(target, label=LABEL[target],
                                     seq=start + i, out=out)
        pred_data_test = best_model.predict(x_test)
        if target == 0:
            x_pred_list.append(pred_data_test[:, 0])
            y_pred_list.append(pred_data_test[:, 1])
        if target == 1 or target == 3:
            m_pred_list.append(pred_data_test[:, 2])
        if target == 2 or target == 3:
            v_pred_list.append(pred_data_test[:, 3])
    if target == 0:
        x_pred = np.mean(x_pred_list, axis=0)
        y_pred = np.mean(y_pred_list, axis=0)
    if target == 1 or target == 3:
        m_pred = np.mean(m_pred_list, axis=0)
    if target == 2 or target == 3:
        v_pred = np.mean(v_pred_list, axis=0)
    # submit
    submit = pd.read_csv('xy_mlp_ensemble10_2nd.csv')
    if target == 0:
        submit.iloc[:, 1] = x_pred
        submit.iloc[:, 2] = y_pred
    if target == 1 or target == 3:
        submit.iloc[:, 3] = m_pred
    if target == 2 or target == 3:
        submit.iloc[:, 4] = v_pred
    submit.to_csv(os.path.join(out, 'mv_ensemble10.csv'), index=None)
def train():
    # the data, split between train and test sets
    (data, labels) = data_loader.get_data()
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=20)
    x_train = x_train.reshape(len(x_train), 256 * 256)
    x_test = x_test.reshape(len(x_test), 256 * 256)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    pca = PCA(n_components=50)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    svm = SVC()
    parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    # {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    grid = GridSearchCV(svm, parameters, verbose=3)
    # grid search learns the best parameters on a subset of the training data
    grid.fit(x_train[0:7000], y_train[0:7000])
    print(grid.best_params_)
    # Now we train the best estimator on the full dataset
    best_svm = grid.best_estimator_
    best_svm.fit(x_train, y_train)
    print("svm done")
    print("Testing")
    print("score: ", best_svm.score(x_test, y_test))
def solve_distributed():
    X, y, n, dim = data_loader.get_data()
    X_trans = X.transpose()
    iteration_cnt = 100
    opt_loss = optimal_solve(X, y)
    all_staleness = [0, 5, 10, 20, 50]
    colors = ['yellow', 'blue', 'black', 'red', 'green']
    worker_cnt = 4
    bt_beta = 0.5
    bt_alpha = 0.1
    worker_data_X = []
    worker_data_X_trans = []
    worker_data_Y = []
    each_worker_data_cnt = n // worker_cnt
    for i in range(worker_cnt):
        worker_data_X.append(
            X[i * each_worker_data_cnt:(i + 1) * each_worker_data_cnt])
        worker_data_Y.append(
            y[i * each_worker_data_cnt:(i + 1) * each_worker_data_cnt])
        worker_data_X_trans.append(worker_data_X[i].transpose())
    for staleness in all_staleness:
        event_seq = seq_gen.gen_event_sequence(worker_cnt, staleness, iteration_cnt)
        worker_param = {i: np.zeros(dim + 1) for i in range(worker_cnt)}
        current_server_beta = np.zeros(dim + 1)
        current_loss = []
        for event in event_seq:
            event_type, worker_num = event.split("_")
            worker_num = int(worker_num)
            assert event_type in ["push", "pull"]
            if event_type == "push":
                local_X = worker_data_X[worker_num]
                local_X_trans = worker_data_X_trans[worker_num]
                local_y = worker_data_Y[worker_num]
                local_beta = worker_param[worker_num]
                gradient = local_X.dot(local_beta) - local_y
                gradient = local_X_trans.dot(gradient)
                gradient_norm = np.linalg.norm(gradient, ord=2)
                gradient_norm_sq = gradient_norm ** 2
                step_size = 1
                # backtracking line search
                while target_func(X, y, current_server_beta - step_size * gradient) > \
                        target_func(X, y, current_server_beta) - bt_alpha * step_size * gradient_norm_sq:
                    step_size *= bt_beta
                current_server_beta -= step_size * gradient
                diff = target_func(X, y, current_server_beta) - opt_loss
                diff /= n
                current_loss.append(math.log(diff))
                print(math.log(diff))
            else:
                worker_param[worker_num] = np.copy(current_server_beta)
        plt.plot([i for i in range(len(current_loss))],
                 list(current_loss),
                 color=colors[all_staleness.index(staleness)],
                 label=('Staleness: ' + str(staleness)))
    plt.title('Linear regression with backtracking line search')
    plt.legend()
    plt.show()
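# seq_gen.gen_event_sequence is not shown in this snippet. A sketch of one
# plausible implementation: workers push gradients round-robin, and a worker
# pulls fresh server parameters once its local copy is `staleness` pushes
# old. The project's real generator may schedule events differently.
def gen_event_sequence(worker_cnt, staleness, iteration_cnt):
    events = []
    pushes_since_pull = [0] * worker_cnt
    for _ in range(iteration_cnt):
        for w in range(worker_cnt):
            if pushes_since_pull[w] >= staleness:
                events.append("pull_" + str(w))  # refresh the stale local copy
                pushes_since_pull[w] = 0
            events.append("push_" + str(w))  # send a gradient to the server
            pushes_since_pull[w] += 1
    return events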
def test_data_baseline(alg, data, true_labels):
    print()
    alg.train(data, true_labels)
    labels = list(map(util.bucket, alg.evaluate(data)))
    true_labels = list(map(util.bucket, true_labels))
    print("##### " + str(alg) + " #####")
    # acc = util.get_accuracy_bucketed(labels, true_labels)
    # print("accuracy on data with " + str(alg) + ": " + str(acc))
    acc, precision, recall = util.evaluate_performance(labels, true_labels)
    # print("bucket accuracy with " + str(alg) + ":" + str(bucket_acc))


if __name__ == '__main__':
    data, true_labels, columns_dict, values_dict = dl.get_data()
    fixed = Fixed_Dose(columns_dict, values_dict)
    # test_dummy_baseline(fixed)
    test_data_baseline(fixed, data, true_labels)
    clinical = Warfarin_Clinical_Dose(columns_dict, values_dict)
    test_data_baseline(clinical, data, true_labels)
    pharma = Warfarin_Pharmacogenetic_Dose(columns_dict, values_dict)
    test_data_baseline(pharma, data, true_labels)
    exp_io.set_logging(
        args.log_level,
        args.log_file,
        filemode=args.log_filemode,
    )
    args.out_file = os.path.join(args.output_dir, args.out_file + '.csv')
    return args


if __name__ == '__main__':
    args = parse_args()
    features, labels = data_loader.get_data(
        data_path=args.in_file,
        k=args.lsb_bits,
    )
    rnn = BranchRNN(
        # features.shape[1],
        1,
        units=args.units,
        hidden_layers=args.hidden_layers,
        window_size=args.window_size,
        tfboard_log=args.tfboard_log,
        update_freq=args.tfboard_freq,
        cudnn=args.cudnn,
        gru=args.gru,
        batch_size=args.batch_size,
        history_size=args.history_size,
        epochs=args.epochs,
def solve_distributed_fixed():
    X, y, n, dim = data_loader.get_data()
    X_trans = X.transpose()
    iteration_cnt = 5000
    # with step size 0.000001, staleness = 5 converges, staleness = 100 diverges
    step_size = 0.0000007
    opt_loss = optimal_solve(X, y)
    all_staleness = [0, 5, 10, 20, 50]
    colors = ['yellow', 'blue', 'black', 'red', 'green']
    worker_cnt = 4
    worker_data_X = []
    worker_data_X_trans = []
    worker_data_Y = []
    each_worker_data_cnt = n // worker_cnt
    for i in range(worker_cnt):
        worker_data_X.append(
            X[i * each_worker_data_cnt:(i + 1) * each_worker_data_cnt])
        worker_data_Y.append(
            y[i * each_worker_data_cnt:(i + 1) * each_worker_data_cnt])
        worker_data_X_trans.append(worker_data_X[i].transpose())
    for staleness in all_staleness:
        event_seq = seq_gen.gen_event_sequence(worker_cnt, staleness, iteration_cnt)
        worker_param = {i: np.zeros(dim + 1) for i in range(worker_cnt)}
        current_server_beta = np.zeros(dim + 1)
        current_loss = []
        for event in event_seq:
            event_type, worker_num = event.split("_")
            worker_num = int(worker_num)
            assert event_type in ["push", "pull"]
            if event_type == "push":
                local_X = worker_data_X[worker_num]
                local_X_trans = worker_data_X_trans[worker_num]
                local_y = worker_data_Y[worker_num]
                local_beta = worker_param[worker_num]
                gradient = local_X.dot(local_beta) - local_y
                gradient = local_X_trans.dot(gradient)
                current_server_beta -= step_size * gradient
                diff = target_func(X, y, current_server_beta) - opt_loss
                diff /= n
                current_loss.append(math.log(diff))
                print(math.log(diff))
            else:
                worker_param[worker_num] = np.copy(current_server_beta)
        # plot every fourth point to keep the curves readable
        loss_to_plot = [current_loss[4 * i] for i in range(len(current_loss) // 4)]
        plt.plot([i for i in range(len(loss_to_plot))],
                 loss_to_plot,
                 color=colors[all_staleness.index(staleness)],
                 label=('Staleness: ' + str(staleness)))
        # plt.plot([i for i in range(len(current_loss))], list(current_loss),
        #          color=colors[all_staleness.index(staleness)],
        #          label=('Staleness: ' + str(staleness)))
    plt.title('Linear regression with learning rate: ' + str(step_size))
    plt.legend()
    plt.show()
def run():
    print('Loading data...')
    # data_folder = '../data/weibo_xiaoice_large'
    training, validation, test, embedding_matrix, label_map, VOCAB = get_data(
        USE_FA=False, USE_GLOVE_EMBED=True)
    tr_gen = batch_generator(training[0], training[-1], batch_size=256, shuffle=True)
    te_gen = batch_generator(validation[0], validation[-1], batch_size=1024,
                             shuffle=False)
    print('VOCAB size:{}'.format(VOCAB))
    # Summation of word embeddings
    LAYERS = 1
    USE_GLOVE = True
    TRAIN_EMBED = False
    EMBED_HIDDEN_SIZE = 256
    SENT_HIDDEN_SIZE = 256
    BATCH_SIZE = 512
    PATIENCE = 6  # 8
    MAX_EPOCHS = 100
    MAX_LEN_TITLE = 30
    MAX_LEN_DES = 128
    DP = 0.2
    L2 = 4e-06
    ACTIVATION = 'relu'
    OPTIMIZER = 'rmsprop'
    # OPTIMIZER = 'adadelta'
    MLP_LAYER = 1
    NGRAM_FILTERS = [1, 2, 3, 4]
    NUM_FILTER = 128
    RNN_Cell = 'BiLSTM'
    print('Embed / Sent = {}, {}'.format(EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE))
    print('GloVe / Trainable Word Embeddings = {}, {}'.format(
        USE_GLOVE, TRAIN_EMBED))
    LABEL_NUM = len(label_map.classes_)
    bst_model_path = '../model/rnn_attn_v2.hdf5'
    pred_path = '../res/rnn_attn_v2.pkl'
    res_path = '../res/rnn_attn_v2.csv'
    embed_title = get_embedding(embedding_matrix, USE_GLOVE, VOCAB,
                                EMBED_HIDDEN_SIZE, TRAIN_EMBED, MAX_LEN_TITLE)
    embed_des = get_embedding(embedding_matrix, USE_GLOVE, VOCAB,
                              EMBED_HIDDEN_SIZE, TRAIN_EMBED, MAX_LEN_DES)
    model = rnn_att_model(embed_title, embed_des, MAX_LEN_TITLE, MAX_LEN_DES,
                          SENT_HIDDEN_SIZE, ACTIVATION, DP, L2, LABEL_NUM,
                          OPTIMIZER, MLP_LAYER, LAYERS, RNN_Cell='BiLSTM')
    early_stopping = EarlyStopping(monitor='val_top_k_categorical_accuracy',
                                   patience=4)
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=True)
    # print(training[0][0].shape[0], validation[0][0].shape[0])
    # model.load_weights(bst_model_path)
    model.fit_generator(
        tr_gen,
        steps_per_epoch=int(training[0][0].shape[0] / BATCH_SIZE) + 1,
        epochs=100,
        verbose=1,
        validation_data=te_gen,
        validation_steps=int(validation[0][0].shape[0] / BATCH_SIZE) + 1,
        max_q_size=20,
        callbacks=[early_stopping, model_checkpoint])
    print('load weights')
    model.load_weights(bst_model_path)
    pred = model.predict(test)
    pd.to_pickle(pred, pred_path)

    def get_ans(pred=pred, idd=np.random.random(3000)):
        pred = pred.argsort(axis=1)[:, ::-1][:, :5]
        ll = label_map.classes_
        ans = [[ll[item] for item in items] for items in pred]
        res = pd.DataFrame(ans)
        res.index = idd
        return res

    test_idx = get_testid()
    ans = get_ans(pred, test_idx)
    ans.to_csv(res_path, index=True, header=False)
from sqlalchemy import create_engine, MetaData, Table
import numpy as np
import json
# from matplotlib import pyplot as plt
import cPickle as pickle
# from runner_nonbayesian import NonBayesianPlayer
import scipy.stats as scistats
from multiprocessing import Pool
import data_loader
import copy

data_matrix, data_dict, features, items = data_loader.get_data()

db_url = "mysql://*****:*****@gureckislab.org/mt_experiments"
table_name = '20q_model_tester_exp2'
data_column_name = 'datastring'

# boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)

# make a query and loop through
s = table.select()
rows = s.execute()
import sys

from genetic_mapper import Mapper
import data_loader

if __name__ == "__main__":
    if len(sys.argv) <= 1:
        print("Please enter path to frequency file!")
    else:
        freq_file = sys.argv[1]
        phenotypeFreq = data_loader.get_data(freq_file)
        print(phenotypeFreq)
        Mapper(phenotypeFreq).solve().print_solution()
model_result_filename_prefix = {
    "perceptron": "perceptron-",
    "slp": "neural-net-single-",
    "mlp": "neural-net-ml-",
    "lstm": "LSTM-pytorch-",
    "gru": "GRU-pytorch-"
}

output_file = open(output_dir_path + "metrics_interval.txt", "w")
for model in ["perceptron", "slp", "mlp", "lstm", "gru"]:
    for file_size in ["56", "10K", "8M"]:
        trace_filename = "gcc-" + file_size + ".txt"
        trace_filepath = trace_dir_path + trace_filename
        results_filepath = (results_dir_path +
                            model_result_filename_prefix[model] + trace_filename)
        trace_features, trace_labels = data_loader.get_data(
            data_path=trace_filepath, k=8)
        result_labels = result_data_loader.get_data(data_path=results_filepath)
        for metric in [accuracy_score, f1_score, mutual_info_score,
                       confusion_matrix, matthews_corrcoef]:
            metric_output = interval_result(trace_labels, result_labels, metric, 4)
            output_file.write("***> " + model + " " + file_size + " " +
                              metric.__name__ + ":\n")
            output_file.write(str(metric_output))
            output_file.write("\n\n\n")
output_file.close()

# (m,n) baseline
output_file_mn = open(output_dir_path + "metrics_interval_mn.txt", "w")
for file_size in ["56", "10K", "8M"]:
'''
Created on Nov 23, 2015

@author: alxcoh
'''
import numpy as np
import data_loader
import scipy.stats as scistats
from runner_clust import ClustPlayer
from multiprocessing import Pool

data_matrix, _, features, objects = data_loader.get_data()

full_probability_matrix_goodmethod = [
    [0.70385265, 0.31086353, 0.18540700, 0.12579383, 0.06243025],
    [0.09839965, 0.27167072, 0.12949217, 0.08246418, 0.03858618],
    [0.09718542, 0.21443453, 0.37983041, 0.20084763, 0.09890584],
    [0.05635820, 0.11671840, 0.17166805, 0.31256788, 0.17611042],
    [0.04420408, 0.08631282, 0.13360238, 0.27832648, 0.62396730]]

logfile = open("clustmaker_logfile.txt", 'w')


def get_clusters_given_posterior(n_clusters, current_clustering,
                                 posterior=np.repeat(0.01, 1000),
                                 knowledge=[]):
    # probget = np.all(0.2, (n_clusters, 218, 5), dtype=np.float64)
    did_shift = True
    clustering = current_clustering[:]
    iterations = 0
    while did_shift:
        iterations += 1
        print("\n\n*****************\nIterations:", iterations, "\n",
              "Clustering:", clustering, "\n*********************\n\n\n")
        did_shift = False
        # curval = cluster_validity(clustering)
all_results = {}
final_results = {}
actual_results = {}
first_time = True
predictions = {}
actuals = {}
for metric_to_estimate in intersting_metrics:
    predictions[metric_to_estimate] = []
    actuals[metric_to_estimate] = []
    if future:
        # Use 2018 data to predict 2019 labels
        df, labels = get_data(2018, 2018, target=metric_to_estimate, future=future)
    else:
        # Use 2019 data to predict 2019 labels
        df, labels = get_data(2019, 2019, target=metric_to_estimate, future=future)
    # Extract names of players in a given year
    names = np.array(labels[['Name']].values.tolist())
    names = names[:, 0]
    print(metric_to_estimate)
    # Extract data for that metric
    metric = np.array(labels[[metric_to_estimate]].values.tolist())
    metric = metric[:, 0]
from tree_variational import *
from runner_variational import *
# from sqlalchemy import create_engine, MetaData, Table
import numpy as np
import json
# from matplotlib import pyplot as plt
import cPickle as pickle
# from runner_nonbayesian import NonBayesianPlayer
import scipy.stats as scistats
from multiprocessing import Pool
import data_loader

data_matrix, data_dict, features, objects = data_loader.get_data()

fulldatafile = open(
    base_path + "src/analysis_files/datalogs/fulldata_correctorder.txt", 'w')

'''
db_url = "mysql://*****:*****@gureckislab.org/mt_experiments"
table_name = '20q_model_tester'
data_column_name = 'datastring'

# boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)

# make a query and loop through
s = table.select()