def sample_dataset(file, amount):
    """Sample the given amount of data from the file.

    Args:
        file (str): File to be sampled.
        amount (int): Amount of data to be drawn from the file.
    """
    # Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)
    # Sample.
    sample_data(file, data, amount=amount)
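# Hedged sketch (not from the original project): plausible stand-ins for the
# `readlines` and `sample_data` helpers assumed above, so the snippet can run in
# isolation. The real helpers may have different behavior and signatures.
import random

def readlines(path, delimiter="\t", lower=True):
    # Read delimiter-separated rows, optionally lowercasing every field.
    with open(path, encoding="utf-8") as f:
        rows = [line.rstrip("\n").split(delimiter) for line in f]
    return [[field.lower() for field in row] for row in rows] if lower else rows

def sample_data(out_file, data, amount):
    # Draw `amount` rows without replacement and write them to a sibling file.
    sampled = random.sample(data, min(amount, len(data)))
    with open(out_file + ".sampled", "w", encoding="utf-8") as f:
        for row in sampled:
            f.write("\t".join(row) + "\n")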
def prepare_data(df, outliers, inliers, seed, fixed_cont, labeled_data, n_oe,
                 oe_path, doc2vec_model, **kwargs):
    print("Only use classes that are in inliers or outliers")
    df = df.where(df.target.isin(outliers + inliers)).dropna()

    # Label data as inliers and outliers (for scoring) and whether
    # they have labels or not (semi-supervised).
    df = label_data(df, seed, labeled_data, outliers)

    if fixed_cont:
        df = sample_data(df, 1.0, fixed_cont, seed)
        print("Data after adjusting for fixed contamination:\n")
        print(df.groupby(['label', 'outlier_label']).size()
                .reset_index().rename(columns={0: 'count'}), "\n")

    if n_oe:
        df_oe = get_outlier_data(oe_path, n_oe, seed=42)
        df_oe["vecs"] = doc2vec_model.vectorize(df_oe["text"])
        df = df.append(df_oe)

    if -1 in df.label.unique() and df.label.value_counts()[-1] != df.shape[0]:
        if df[(df.label == 0) & (df.outlier_label == -1)].shape[0] == 0:
            print("Adding missing sample for labeled outlier")
            df.loc[((df.label == -1) & (df.outlier_label == -1)).idxmax(), 'label'] = 0

    print("Training data:\n",
          df.groupby(['label', 'outlier_label']).size()
            .reset_index().rename(columns={0: 'count'}), "\n\n")
    return df
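# Hedged sketch (not the project's sample_data): one plausible reading of the
# sample_data(df, frac, contamination, seed) call above, i.e. keep a fraction of the
# inliers and downsample the outlier class to a fixed contamination rate.
import pandas as pd

def sample_data_sketch(df, frac, contamination, seed):
    inliers = df[df.outlier_label == 1].sample(frac=frac, random_state=seed)
    n_out = int(round(len(inliers) * contamination / (1.0 - contamination)))
    n_out = min(n_out, int((df.outlier_label == -1).sum()))
    outliers = df[df.outlier_label == -1].sample(n=n_out, random_state=seed)
    # Shuffle the combined frame so the two classes are not grouped.
    return pd.concat([inliers, outliers]).sample(frac=1.0, random_state=seed)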
from sklearn.metrics import r2_score

# Data factory.
DF = Data_Factory_Base()
dim = 3
batch_size = 40
train_num = 10000
train_data = DF.convex_1(dim=dim, num=3 * batch_size)
x_train = train_data[:, :-1].astype('float32')
y_train = train_data[:, -1].astype('float32')
test_data = DF.convex_1(dim=dim, num=1 * batch_size)
x_test = test_data[:, :-1].astype('float32')
y_test = test_data[:, -1].astype('float32')

# Dataset (2d).
sampled_data = sample_data(train_data, sample_num=train_num)
feed_data = tf.data.Dataset.from_tensor_slices(sampled_data).batch(batch_size)

# GP
kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(
    amplitude=tf.Variable(1.0, dtype=np.float32, name="amplitude"),
    length_scale=tf.Variable(1.0, dtype=np.float32, name="length_scale"),
)
# k(x, y) = amplitude**2 * exp(-||x - y||**2 / (2 * length_scale**2)),
# i.e. the RBF kernel.
gp = tfp.distributions.GaussianProcess(kernel)

# Define the training process.
model_2d = ContEncoder(dest_dim=dim - 1, original_dim=dim)
opt_2d = tf.keras.optimizers.Adam(learning_rate=2e-1)
train_gp_loop_2d(
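# Hedged aside (made-up values, assumes eager TF2): a quick numerical check that
# ExponentiatedQuadratic matches the closed form quoted in the comment above.
import numpy as np
import tensorflow_probability as tfp

k = tfp.math.psd_kernels.ExponentiatedQuadratic(amplitude=2.0, length_scale=0.5)
x = np.array([[0.0, 0.0]], dtype=np.float32)
y = np.array([[1.0, 1.0]], dtype=np.float32)
print(k.matrix(x, y).numpy())                    # value computed by TFP
print(2.0 ** 2 * np.exp(-2.0 / (2 * 0.5 ** 2)))  # amplitude**2 * exp(-||x-y||**2 / (2*length_scale**2))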
g_optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
    g_loss, var_list=g_vars)

# End: Build model.
################################################################################

# Start session.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# train()
for it in range(max_iter):
    x_batch, w_batch = sample_data(data_normed, data_raw_weights, batch_size)
    z_batch = get_sample_z(batch_size, noise_dim)

    for _ in range(5):
        _, d_logit_real_, d_logit_fake_, d_loss_, g_loss_ = sess.run(
            [d_optim, d_logit_real, d_logit_fake, d_loss, g_loss],
            feed_dict={
                z: z_batch,
                x: x_batch
            })

    for _ in range(1):
        _, d_logit_real_, d_logit_fake_, d_loss_, g_loss_ = sess.run(
            [g_optim, d_logit_real, d_logit_fake, d_loss, g_loss],
            feed_dict={
                z: z_batch,
                x: x_batch
import config
import parse_movies
import utils
import numpy as np
import os
from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_extraction import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir, config.data_file)))

# Sample the data down to 6000 movies for each decade from 1930 to 2010.
sampled_movies = utils.sample_data(all_movies, 6000)

# Split the data into train and test datasets by alternating movies.
train_data = []
test_data = []
flip = True
for m in sampled_movies:
    if flip:
        train_data.append(m)
        flip = False
    else:
        test_data.append(m)
        flip = True

#===============================================
# 4a. Use the sklearn library to train on the data.
# Start session.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# train()
# Start clock to time execution in chunks of log_step steps.
t0 = time.time()
for step in range(max_step):
    #####
    # BEGINNING OF TIMED SEGMENT
    x_batch_preup, w_batch_preup = sample_data(
        data_normed, data_raw_weights, batch_size)
    z_batch = get_sample_z(batch_size, noise_dim)

    # UPSAMPLE WITHIN BATCH.
    x_batch = x_batch_preup[:]
    if sampling == 'random':
        # Upsample then randomly select batch_size to use.
        #for x_, w_ in zip(x_batch_preup, w_batch_preup):
        #    k = int(round(w_))
        #    for _ in range(k - 1):
        #        x_batch.append(x_)
        #x_batch = np.reshape(x_batch, [-1, data_dim])
        #x_batch = x_batch[np.random.choice(len(x_batch), batch_size)]
        for x_, w_ in zip(x_batch_preup, w_batch_preup):
            k = int(round(w_))
            for _ in range(k - 1):
def train():
    for i in range(train_iter):
        ts = time.time()
        print "[{}%]".format(i / float(train_iter) * 100)
        mini_batch = []
        idxs = []
        is_weight = []
        old_q = []
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            suction_1_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[0] - 1 for idx in _idxs]
        suction_1_sampled[tmp] += 1
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            suction_2_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[1] - 1 for idx in _idxs]
        suction_2_sampled[tmp] += 1
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            gripper_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[2] - 1 for idx in _idxs]
        gripper_sampled[tmp] += 1

        for j in range(len(mini_batch)):
            color = cv2.imread(mini_batch[j].color)
            depth = np.load(mini_batch[j].depth)
            pixel_index = mini_batch[j].pixel_idx
            next_color = cv2.imread(mini_batch[j].next_color)
            next_depth = np.load(mini_batch[j].next_depth)
            action_str, rotate_idx = utils.get_action_info(pixel_index)
            old_q.append(
                trainer.forward(color, depth, action_str, False, rotate_idx,
                                clear_grad=True)[0, pixel_index[1], pixel_index[2]])
            reward = mini_batch[j].reward
            td_target = trainer.get_label_value(reward, next_color, next_depth,
                                                mini_batch[j].is_empty, pixel_index[0])
            loss_ = trainer.backprop(color, depth, pixel_index, td_target,
                                     is_weight[j], mini_batch_size, j == 0,
                                     j == len(mini_batch) - 1)

        # Update priority
        for j in range(len(mini_batch)):
            color = cv2.imread(mini_batch[j].color)
            depth = np.load(mini_batch[j].depth)
            pixel_index = mini_batch[j].pixel_idx
            next_color = cv2.imread(mini_batch[j].next_color)
            next_depth = np.load(mini_batch[j].next_depth)
            reward = mini_batch[j].reward
            td_target = trainer.get_label_value(reward, next_color, next_depth,
                                                mini_batch[j].is_empty, pixel_index[0])
            action_str, rotate_idx = utils.get_action_info(pixel_index)
            new_value = trainer.forward(color, depth, action_str, False, rotate_idx,
                                        clear_grad=True)[0, pixel_index[1], pixel_index[2]]
            if j / mini_batch_size == 0:
                suction_1_memory.update(idxs[j], td_target - new_value)
            elif j / mini_batch_size == 1:
                suction_2_memory.update(idxs[j], td_target - new_value)
            else:
                gripper_memory.update(idxs[j], td_target - new_value)
            #print "Q value: {} -> {}| TD target: {}".format(old_q[j], new_value, td_target)

        if (i + 1) % save_freq == 0:
            print "Save model"
            torch.save(trainer.behavior_net.state_dict(),
                       save_root + "/{}.pth".format(i + 1))
            color = cv2.imread(compare_color)
            depth = np.load(compare_depth)
            suck_1_prediction, suck_2_prediction, grasp_prediction = trainer.forward(
                color, depth, is_volatile=True)
            heatmaps, mixed_imgs = utils.save_heatmap_and_mixed(
                suck_1_prediction, suck_2_prediction, grasp_prediction,
                feat_path, mixed_path, color, i + 1)
            np.savetxt(save_root + "/suction_1_sampled.csv", suction_1_sampled,
                       delimiter=",")
            np.savetxt(save_root + "/suction_2_sampled.csv", suction_2_sampled,
                       delimiter=",")
            np.savetxt(save_root + "/gripper_sampled.csv", gripper_sampled,
                       delimiter=",")
        if (i + 1) % copy_target_net == 0:
            trainer.target_net.load_state_dict(trainer.behavior_net.state_dict())
        print "Took {} seconds".format(time.time() - ts)
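# Hedged sketch (not the repo's utils.sample_data): the call pattern above suggests a
# prioritized-replay-style sampler returning (transitions, buffer indices, importance
# weights). A minimal uniform-weight stand-in, assuming the buffer exposes a `data`
# list, could look like:
import numpy as np

def sample_data_sketch(memory_buffer, mini_batch_size):
    idxs = np.random.randint(0, len(memory_buffer.data), size=mini_batch_size)
    mini_batch = [memory_buffer.data[i] for i in idxs]
    is_weight = np.ones(mini_batch_size, dtype=np.float32)  # uniform IS weights
    return mini_batch, list(idxs), is_weight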
def train(self, X, n_iter=1000, w0=None, rate=0.01, alpha=0.5, mu=1e-6,
          sample=False, n_samples=100, evidence=None, warm_starts=False,
          tol=1e-6, verbose=True):
    """
    Perform SGD wrt the weights w
    * n_iter:      Number of steps of SGD
    * w0:          Initial value for weights w
    * rate:        I.e. the SGD step size
    * alpha:       Elastic net penalty mixing parameter (0=ridge, 1=lasso)
    * mu:          Elastic net penalty
    * sample:      Whether to sample or not
    * n_samples:   Number of samples per SGD step
    * evidence:    Ground truth to condition on
    * warm_starts:
    * tol:         For testing for SGD convergence, i.e. stopping threshold
    """
    self.X_train = X

    # Set up stuff
    N, M = X.shape
    print "=" * 80
    print "Training marginals (!= 0.5):\t%s" % N
    print "Features:\t\t\t%s" % M
    print "=" * 80
    Xt = X.transpose()
    Xt_abs = sparse_abs(Xt) if sparse.issparse(Xt) else np.abs(Xt)
    w0 = w0 if w0 is not None else np.ones(M)

    # Initialize training
    w = w0.copy()
    g = np.zeros(M)
    l = np.zeros(M)
    g_size = 0

    # Gradient descent
    if verbose:
        print "Begin training for rate={}, mu={}".format(rate, mu)
    for step in range(n_iter):
        # Get the expected LF accuracy
        t, f = sample_data(X, w, n_samples=n_samples) if sample else exact_data(X, w, evidence)
        p_correct, n_pred = transform_sample_stats(Xt, t, f, Xt_abs)

        # Get the "empirical log odds"; NB: this assumes one is correct, clamp is for sampling...
        l = np.clip(log_odds(p_correct), -10, 10)

        # SGD step with normalization by the number of samples
        g0 = (n_pred * (w - l)) / np.sum(n_pred)

        # Momentum term for faster training
        g = 0.95 * g0 + 0.05 * g

        # Check for convergence
        wn = np.linalg.norm(w, ord=2)
        g_size = np.linalg.norm(g, ord=2)
        if step % 250 == 0 and verbose:
            print "\tLearning epoch = {}\tGradient mag. = {:.6f}".format(step, g_size)
        if (wn < 1e-12 or g_size / wn < tol) and step >= 10:
            if verbose:
                print "SGD converged for mu={} after {} steps".format(mu, step)
            break

        # Update weights
        w -= rate * g

        # Apply elastic net penalty
        w_bias = w[-1]
        soft = np.abs(w) - mu
        ridge_pen = (1 + (1 - alpha) * mu)

        # \ell_1 penalty by soft thresholding | \ell_2 penalty
        w = (np.sign(w) * np.select([soft > 0], [soft], default=0)) / ridge_pen

        # Don't regularize the bias term
        if self.bias_term:
            w[-1] = w_bias

    # SGD did not converge
    else:
        if verbose:
            print "Final gradient magnitude for rate={}, mu={}: {:.3f}".format(rate, mu, g_size)

    # Return learned weights
    self.w = w
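# Hedged numeric illustration (made-up values) of the elastic-net step above:
# soft-threshold for the \ell_1 part, then shrink by the ridge factor for the \ell_2 part.
import numpy as np

w = np.array([0.80, -0.03, 0.50])
mu, alpha = 0.05, 0.5
soft = np.abs(w) - mu
ridge_pen = 1 + (1 - alpha) * mu
w_new = (np.sign(w) * np.select([soft > 0], [soft], default=0)) / ridge_pen
print(w_new)  # weights smaller than mu are zeroed, the rest shrink slightly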
#==============================================
# 2d. PMF of P(Y|X"the">0)
#==============================================
pmf, data_year = cal_pmf(all_movies, 'the')
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'Balanced PMF of P(Y|X"the">0)')
print 'Balanced PMF of P(Y|X"the">0) done'

# Sample the data down to 6000 movies for each decade from 1930 to 2010.
sampled_movies = utils.sample_data(all_movies, 6000, (1930, 2010))

#==============================================
# 2e. PMF of P(Y|X"radio">0)
#==============================================
pmf, data_year = cal_pmf(sampled_movies, 'radio')
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'Balanced PMF of P(Y|X"radio">0)')
print 'Balanced PMF of P(Y|X"radio">0) done'
#==============================================
import config
import parse_movies
import utils
import numpy as np
import os
from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_extraction import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

all_movies = list(parse_movies.load_all_movies(os.path.join(config.baseDir, config.data_file)))

# Sample the data down to 6000 movies for each decade from 1930 to 2010.
sampled_movies = utils.sample_data(all_movies, 6000)

# Split the data into train and test datasets by alternating movies.
train_data = []
test_data = []
flip = True
for m in sampled_movies:
    if flip:
        train_data.append(m)
        flip = False
    else:
        test_data.append(m)
        flip = True

#===============================================
# 4a. Use the sklearn library to train on the data.
# For every item in the training data, build the bag of words, convert it to a
# feature vector, and pair it with its year.
sumList = []
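# Hedged sketch (guessed movie-dict fields): how the imported CountVectorizer and
# MultinomialNB would typically turn plot summaries into bag-of-words features and
# fit a decade classifier. The keys m['summary'] and m['year'] are assumptions.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform([m['summary'] for m in train_data])
y_train = [(m['year'] // 10) * 10 for m in train_data]
clf = MultinomialNB().fit(X_train, y_train)
X_test = vectorizer.transform([m['summary'] for m in test_data])
y_test = [(m['year'] // 10) * 10 for m in test_data]
print(clf.score(X_test, y_test))  # held-out decade accuracy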
# Jump to the previous cycle for restart.
if epoch == initial_epoch:
    if cycle < initial_cycle:
        continue

retval_list = mp.Manager().list()  # Is this needed?
# List of multiprocessing.managers.ListProxy objects to collect losses.
retval_list = [mp.Manager().list() for i in range(args.ncpus)]
st = time.time()
processes = []
for pid in range(args.ncpus):
    # Sample keys without considering activeness.
    if args.active_ratio is None:
        keys = random.sample(id_to_smiles.keys(), args.item_per_cycle)
    # Sample active and inactive keys by the required ratio.
    else:
        keys = utils.sample_data(id_to_whole_conditions,
                                 args.item_per_cycle, args.active_ratio)
    # Property (descriptor) values act as conditions.
    # We need both the whole and the scaffold values.
    # whole_conditions := [
    #     [value1, value2, ...],  # condition values of whole 1
    #     [value1, value2, ...],  # condition values of whole 2
    #     ...]
    whole_conditions = [id_to_whole_conditions[key] for key in keys]
    scaffold_conditions = [id_to_scaffold_conditions[key] for key in keys]
    # SMILES strings of whole molecules and scaffolds.
    wholes = [id_to_smiles[key][0] for key in keys]
    scaffolds = [id_to_smiles[key][1] for key in keys]
    proc = mp.Process(target=train,
                      args=(shared_model, shared_optimizer, wholes, scaffolds,
                            whole_conditions, scaffold_conditions, pid,
                            retval_list, args))
    oe_path=oe_path,
    doc2vec_model=doc2vec_model,
    **params)

# combine
if params["weakly_supervised"]:
    df = df.append(df_weakly).reset_index(drop=True)

# label test set
df_test["label"] = 0
df_test.loc[~df_test.target.isin(params["test_outliers"]), "label"] = 1
df_test["outlier_label"] = -1
df_test.loc[~df_test.target.isin(params["test_outliers"]), "outlier_label"] = 1

# sample the df_test set
df_test = sample_data(df_test, 1.0, 0.1, 42)
df_test = df_test[df_test.target.isin(params["test_outliers"] + params["test_inliers"])]

print("df_train")
print(df.label.value_counts())
print(df.target.value_counts())
print("df_test")
print(df_test.label.value_counts())
print(df_test.target.value_counts())

#####
# train
#####

# UMAP Train
if suction_1_memory_buffer.length > mini_batch_size and \
   suction_2_memory_buffer.length > mini_batch_size and \
   gripper_memory_buffer.length > mini_batch_size:
    sufficient_exp += 1
    if (sufficient_exp - 1) % learning_freq == 0:
        back_ts = time.time()
        if arduino:
            arduino.write("b 1000")
        learned_times += 1
        mini_batch = []
        idxs = []
        is_weight = []
        old_q = []
        td_target_list = []
        if specific_tool is not None:
            if specific_tool == 0:
                mini_batch, idxs, is_weight = utils.sample_data(
                    suction_1_memory_buffer, mini_batch_size)
            elif specific_tool == 1:
                mini_batch, idxs, is_weight = utils.sample_data(
                    suction_2_memory_buffer, mini_batch_size)
            elif specific_tool == 2:
                mini_batch, idxs, is_weight = utils.sample_data(
                    gripper_memory_buffer, mini_batch_size)
        else:
            _mini_batch, _idxs, _is_weight = utils.sample_data(
                suction_1_memory_buffer, mini_batch_size)
            mini_batch += _mini_batch
            idxs += _idxs
            is_weight += list(_is_weight)
            _mini_batch, _idxs, _is_weight = utils.sample_data(
                suction_2_memory_buffer, mini_batch_size)
            mini_batch +=