def load_datasets(self, target, data_dir, batch_dir, batch_image_size, stride_size=0):
    print("Loading datasets for [%s]..." % target)
    util.make_dir(batch_dir)

    if stride_size == 0:
        stride_size = batch_image_size // 2

    if self.bicubic_init:
        resampling_method = "bicubic"
    else:
        resampling_method = "nearest"

    datasets = util.DataSets(self.scale, batch_image_size, stride_size,
                             channels=self.channels, jpeg_mode=self.jpeg_mode,
                             max_value=self.max_value, resampling_method=resampling_method)

    if not datasets.is_batch_exist(batch_dir):
        datasets.build_batch(data_dir, batch_dir)

    if target == "training":
        datasets.load_batch_train(batch_dir)
        self.train = datasets
    else:
        datasets.load_batch_test(batch_dir)
        self.test = datasets
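# A minimal usage sketch (hypothetical flag names; assumes this method lives on
# the model class whose __init__ below reads the same flags):
#
#   model.load_datasets("training", flags.data_dir + "/" + flags.dataset,
#                       flags.batch_dir + "/" + flags.dataset,
#                       flags.batch_image_size, flags.stride_size)
#   model.load_datasets("test", flags.data_dir + "/" + flags.test_dataset,
#                       flags.batch_dir + "/" + flags.test_dataset,
#                       flags.batch_image_size, flags.stride_size)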
def main():
    if len(sys.argv) != 7:
        usage()
    pred_dir = sys.argv[1]
    helper.check_dir_exist(pred_dir)
    true_segment_dir = sys.argv[2]
    helper.check_dir_exist(true_segment_dir)
    ct = sys.argv[3]
    outDir = sys.argv[4]
    helper.make_dir(outDir)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[5])
    num_score_bins = helper.get_command_line_integer(sys.argv[6])
    print "Done getting command line arguments"
    # first, get the upper bounds for the score bins
    (reverse_lower_bound_list, upper_bound_score_list) = get_score_bins(num_score_bins)
    print "Done getting the posterior-probability bounds for each score bin"
    # get the counts of true positives, false positives, etc. across all regions in the genome
    total_tp_fp_df = get_tp_fp_data_all_regions(true_segment_dir, pred_dir, reverse_lower_bound_list, ct, num_chromHMM_state)
    print "Done processing the files corresponding to all regions in the genome"
    # calculate tpr and fpr values for each of the states
    save_fn = os.path.join(outDir, 'tpr_fpr_all_states.txt.gz')
    calculate_tpr_fpr(total_tp_fp_df, num_chromHMM_state, save_fn)
    print "Done calculating true positive rates and false positive rates in all bins"
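# get_score_bins is defined elsewhere in this script; a minimal sketch of the
# behavior the call above assumes (evenly spaced posterior-probability
# thresholds on [0, 1], with the lower bounds returned highest-first so the
# most confident bin is counted first -- an assumption, not the verified
# implementation):
#
# def get_score_bins(num_score_bins):
#     upper_bound_score_list = [float(i) / num_score_bins for i in range(1, num_score_bins + 1)]
#     reverse_lower_bound_list = sorted([ub - 1.0 / num_score_bins for ub in upper_bound_score_list], reverse=True)
#     return (reverse_lower_bound_list, upper_bound_score_list)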
def call_cross_validation_functions(validate_ct, ct_list, outDir, train_sampled_data_fn, all_ct_posterior_folder, num_chromHMM_state):
    code_file_fn = '/u/home/h/havu73/project-ernst/source_pete/train_and_evaluate/posterior_based/train_predict_chromHMM_posterior.py'
    val_outDir = os.path.join(outDir, 'val_' + validate_ct)
    print "Running validation of ct: " + validate_ct
    for train_ct_i, train_ct in enumerate(ct_list):  # this ct will be used as the response variable for training
        # leave out the response ct; all the remaining cell types will be used as predictors and passed into the program
        predictor_ct_list = ct_list[:train_ct_i] + ct_list[(train_ct_i + 1):]
        num_predictor_ct = len(predictor_ct_list)
        this_predict_outDir = os.path.join(val_outDir, 'pred_' + train_ct)
        helper.make_dir(this_predict_outDir)
        command = ['python', code_file_fn, train_sampled_data_fn, all_ct_posterior_folder,
                   this_predict_outDir, train_ct, str(num_chromHMM_state), str(num_predictor_ct)] + predictor_ct_list
        print "Within, running predicting cell type: " + train_ct
        # call(command)  # subprocess call, currently disabled
    print "Averaging results from different predictions for this validation"
    averaging_predictions_to_validate_one_ct(validate_ct_dir=val_outDir, validate_ct=validate_ct, num_pred_ct=len(ct_list))
    print ""
    print ""
def main():
    if len(sys.argv) != 3:
        usage()
    chrom_dir = sys.argv[1]
    helper.check_dir_exist(chrom_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    print "Done getting command line arguments"
def main():
    if len(sys.argv) != 4:
        usage()
    avg_state_dir = sys.argv[1]
    helper.check_dir_exist(avg_state_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[3])
    print "Done getting command line arguments"
    calculate_hist_parallel(avg_state_dir, out_dir, num_chromHMM_state)
def main():
    if len(sys.argv) != 4:
        usage()
    all_ct_hist_dir = sys.argv[1]
    helper.check_dir_exist(all_ct_hist_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[3])
    print "Done getting command line arguments"
    average_histogram_across_all_ct(all_ct_hist_dir, out_dir, num_chromHMM_state)
    print "Done!"
def main():
    if len(sys.argv) != 7:
        usage()
    ct_pos_dir = sys.argv[1]
    helper.check_dir_exist(ct_pos_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[3])
    ct_name = sys.argv[4]
    prefix_pos_fn = sys.argv[5]
    suffix_pos_fn = sys.argv[6]
    print "Done getting command line arguments"
    calculate_hist_parallel(ct_pos_dir, out_dir, num_chromHMM_state, ct_name, prefix_pos_fn, suffix_pos_fn)
def main():
    if len(sys.argv) != 6:
        usage()
    cg_dir = sys.argv[1]
    helper.check_dir_exist(cg_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    num_chromHMM_model = helper.get_command_line_integer(sys.argv[3])
    num_score_bins = helper.get_command_line_integer(sys.argv[4])
    cell_type_list_fn = sys.argv[5]
    helper.check_file_exist(cell_type_list_fn)  # check that the file exists before trying to read it
    ct_list = helper.get_list_from_line_seperated_file(cell_type_list_fn)
    print "Done getting command line arguments"
    calculate_summary_staistics_across_ct(cg_dir, out_dir, num_chromHMM_model, num_score_bins, ct_list)
    print "Done!"
def main():
    num_mandatory_args = 8
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    # where the segmentation data of all cell types are combined, stored in files corresponding to different regions in the genome
    all_ct_segment_folder = sys.argv[2]
    if not os.path.isdir(all_ct_segment_folder):
        print "all_ct_segment_folder IS NOT VALID: " + all_ct_segment_folder
        usage()
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    try:
        num_chromHMM_state = int(sys.argv[5])
        assert num_chromHMM_state > 0, "num_chromHMM_state needs to be positive"
        num_train_ct = int(sys.argv[6])
        assert num_train_ct > 0, "num_train_ct needs to be positive"
    except (ValueError, AssertionError):
        print "num_chromHMM_state or num_train_ct is not valid"
        usage()
    train_mode = sys.argv[7]
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print "num_train_ct is different from the number of arguments passed into the program"
        usage()
    print "Done getting command line arguments"
    train_cell_types = sys.argv[num_mandatory_args:]  # the remaining arguments are the cell types used to train the model
    # 1. Get the predictor and response data for training
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(train_cell_types, response_ct, num_chromHMM_state, train_segment_fn, train_mode)
    print "Done getting one-hot data"
    print Xtrain_segment_df.head()
    print
    print Y_df.head()
    # 2. Train the regression machine
    regression_machine = train_model(Xtrain_segment_df, Y_df, num_chromHMM_state, train_mode)
    print "Done training"
    # 3. Using the trained machine, predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_segment_folder, regression_machine, predict_outDir, train_cell_types, response_ct, num_chromHMM_state, train_mode)
    print "Done predicting whole genome"
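# get_XY_segmentation_data is defined elsewhere; a minimal sketch of the
# one-hot encoding it is assumed to perform (pandas only; train_mode handling
# is omitted, and the segment_df layout -- one state-label column per cell
# type -- is an assumption):
#
# import pandas as pd
# def get_XY_one_hot_sketch(segment_df, train_cell_types, response_ct):
#     # one-hot encode each predictor cell type's state labels, e.g. E047 --> E047_1 .. E047_18
#     X = pd.get_dummies(segment_df[train_cell_types], columns=train_cell_types)
#     Y = segment_df[response_ct]  # raw state labels 1..num_chromHMM_state for the response ct
#     return X, Y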
def main():
    if len(sys.argv) != 3:
        usage()
    org_ct_segment_folder = sys.argv[1]
    if not os.path.isdir(org_ct_segment_folder):
        print "org_ct_segment_folder IS NOT VALID: " + org_ct_segment_folder
        usage()
    output_folder = sys.argv[2]
    helper.make_dir(output_folder)
    print "Done getting command line arguments"
    ct_list, ct_df_list = get_ct_segment_df(org_ct_segment_folder)
    print "Done getting segment_df for all cell types"
    chrom_len_dict = get_chromosome_length(org_ct_segment_folder, ct_list)
    print "Done getting chromosome length"
    combine_segment_in_parallel(ct_list, ct_df_list, chrom_len_dict, output_folder)
    print ""
    print ""
    print "Done!"
def main():
    num_mandatory_args = 7
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    # where the posterior data of all cell types are combined, stored in files corresponding to different regions in the genome
    all_ct_posterior_folder = sys.argv[2]
    helper.check_dir_exist(all_ct_posterior_folder)
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    try:
        num_chromHMM_state = int(sys.argv[5])
        assert num_chromHMM_state > 0, "num_chromHMM_state needs to be positive"
        num_train_ct = int(sys.argv[6])
        assert num_train_ct > 0, "num_train_ct needs to be positive"
    except (ValueError, AssertionError):
        print "num_chromHMM_state or num_train_ct is not valid"
        usage()
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print "num_train_ct is different from the number of arguments passed into the program"
        usage()
    print "Done getting command line arguments"
    train_cell_types = sys.argv[num_mandatory_args:]  # the remaining arguments are the cell types used to train the model
    # 1. Get the predictor and response data for training.
    # Xtrain_segment_df example colnames: 'E047_S16', 'E047_S17' --> posterior probabilities of each state in each cell type used for training.
    # Y_df example colname: 'E047' --> state numbers 1 --> 18 at each position, used as training data for the response cell type.
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(train_cell_types, response_ct, num_chromHMM_state, train_segment_fn)
    print "Done getting one-hot data"
    print Xtrain_segment_df.head()
    print
    print Y_df.head()
    # 2. Train the regression machine
    regression_machine = train_multinomial_logistic_regression(Xtrain_segment_df, Y_df, num_chromHMM_state)
    print "Done training"
    # 3. Using the trained machine, predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_posterior_folder, regression_machine, predict_outDir, train_cell_types, response_ct, num_chromHMM_state)
    print "Done predicting whole genome"
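# train_multinomial_logistic_regression is defined elsewhere; a minimal sketch
# of what it is assumed to do with the frames above (scikit-learn's
# multinomial LogisticRegression; the solver and default regularization are
# assumptions):
#
# from sklearn.linear_model import LogisticRegression
# def train_multinomial_logistic_regression_sketch(X_df, Y_df, num_chromHMM_state):
#     machine = LogisticRegression(multi_class='multinomial', solver='lbfgs')
#     machine.fit(X_df.values, Y_df.values.ravel())  # Y holds state labels 1..num_chromHMM_state
#     return machine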
def main():
    if len(sys.argv) != 7:
        usage()
    train_sampled_data_fn = sys.argv[1]
    helper.check_file_exist(train_sampled_data_fn)
    outDir = sys.argv[2]
    helper.make_dir(outDir)
    all_ct_posterior_folder = sys.argv[3]
    helper.check_dir_exist(all_ct_posterior_folder)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[4])
    validate_ct = sys.argv[5]
    all_ct_list_fn = sys.argv[6]
    print "Done getting command line arguments"
    # get all cell types
    ct_list = get_all_train_ct_list(all_ct_list_fn, validate_ct)
    print ct_list
    # run the cross-validation over all cell types
    call_cross_validation_functions(validate_ct, ct_list, outDir, train_sampled_data_fn, all_ct_posterior_folder, num_chromHMM_state)
def histogram_movie(data_loc, resolution, plot_loc):
    """
    Produces a directory of images depicting the probability density
    (histogram) of the positions for each time step, which can be made
    into a movie.

    data_loc   : directory where the simulated data is located
    resolution : number of bins for the histogram
    plot_loc   : directory where the plots will be placed
    """
    # making a list of all the files
    file_list = glob.glob(f"{data_loc}/experiment*")

    # importing all of the data from the experiments
    print("Importing data...")
    all_data = np.array([np.load(file) for file in file_list])

    # extracting time series (assumes common time scaling across experiments)
    ts = all_data[0][0]

    # extracting all position data
    pos_data = np.array([all_data[i][1] for i in range(len(all_data))])

    print("Producing plots...")
    # creating a folder to save the plots
    make_dir(plot_loc)

    # making histogram plots
    for i in range(len(ts)):
        print(f"\r{i}/{len(ts)}", end="")
        plt.clf()  # clear figure
        plt.xlim(-1.5, 1.5)  # setting a common x axis
        # we take the histogram across experiments for each timestep,
        # hence the transposing
        plt.hist(pos_data.T[i], bins=resolution, range=(-1.0, 1.0))
        plt.title(f"Time : {ts[i]} units")  # keeping track of time
        plt.savefig(f"./{plot_loc}/step-{i:05d}.png")
    print("\nPlot production complete!")
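# One way to stitch the saved frames into a movie afterwards (a sketch,
# assuming ffmpeg is installed; the frame rate and output name are arbitrary
# choices):
#
#   ffmpeg -framerate 30 -i <plot_loc>/step-%05d.png -pix_fmt yuv420p density.mp4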
def main():
    if len(sys.argv) != 8:
        usage()
    train_sampled_data_fn = sys.argv[1]
    helper.check_file_exist(train_sampled_data_fn)
    outDir = sys.argv[2]
    helper.make_dir(outDir)
    all_ct_segment_folder = sys.argv[3]
    helper.check_dir_exist(all_ct_segment_folder)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[4])
    validate_ct = sys.argv[5]
    train_mode = sys.argv[6]
    all_ct_list_fn = sys.argv[7]
    print "Done getting command line arguments"
    # get the list of all genomic positions used to segment the genome for model training (chromosome Y is excluded from all analyses)
    gen_pos_list = get_genomic_positions_list(all_ct_segment_folder)
    # get all cell types
    ct_list = get_all_train_ct_list(all_ct_list_fn, validate_ct)
    # run the cross-validation over all cell types
    call_cross_validation_functions(validate_ct, ct_list, outDir, train_sampled_data_fn, all_ct_segment_folder, num_chromHMM_state, gen_pos_list, train_mode)
def averaging_predictions_to_validate_one_ct(validate_ct_dir, validate_ct, num_pred_ct, gen_pos_list):
    # num_pred_ct: number of cell types whose predictions we average to get the predictions for the validation cell type
    # validate_ct_dir: the directory holding the data associated with the validation cell type
    pred_dir_list = glob.glob(validate_ct_dir + "/pred_*")
    pred_ct_list = map(lambda x: (x.split('/')[-1]).split('_')[-1], pred_dir_list)  # path/to/pred_E034 --> E034
    assert len(pred_dir_list) == num_pred_ct, 'Number of pred_dir_list is not the same as the number of specified ct used to predict the model'
    # the folder where the results of averaging across different prediction cell types will be stored
    validate_outDir = os.path.join(validate_ct_dir, 'average_predictions')
    helper.make_dir(validate_outDir)
    # loop through each genomic window and average the results across different predictions for all positions in this window
    for gene_window in gen_pos_list:
        this_window_output_fn = os.path.join(validate_outDir, gene_window + "_avg_pred.txt.gz")
        this_window_pred_fn_list = map(lambda x: os.path.join(x, gene_window + "_pred_out.txt.gz"), pred_dir_list)
        # calculate the average prediction across different prediction cell types for this window, and save the results
        average_multiple_result_files(this_window_pred_fn_list, this_window_output_fn)
    return
def averaging_predictions_to_validate_one_ct(validate_ct_dir, validate_ct, num_pred_ct):
    # num_pred_ct: number of cell types whose predictions we average to get the predictions for the validation cell type
    # validate_ct_dir: the directory holding the data associated with the validation cell type
    pred_dir_list = glob.glob(validate_ct_dir + "/pred_*")
    pred_ct_list = map(lambda x: (x.split('/')[-1]).split('_')[-1], pred_dir_list)  # path/to/pred_E034 --> E034
    assert len(pred_dir_list) == num_pred_ct, 'Number of pred_dir_list is not the same as the number of specified ct used to predict the model'
    # the folder where the results of averaging across different prediction cell types will be stored
    validate_outDir = os.path.join(validate_ct_dir, 'average_predictions')
    helper.make_dir(validate_outDir)
    # loop through each chromosome and average the results across different predictions for all positions in that chromosome
    for chrom_index in helper.CHROMOSOME_LIST:
        this_window_pred_fn_list = map(lambda x: os.path.join(x, 'chr' + str(chrom_index) + "_pred_out.txt.gz"), pred_dir_list)
        # average the predictions of multiple cell types for each genomic bin (200bp) in this chromosome, and save the results
        average_multiple_result_files(this_window_pred_fn_list, chrom_index, validate_outDir)
        print "Done averaging results for chromosome: " + str(chrom_index)
    return
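# average_multiple_result_files is defined elsewhere; a minimal sketch of the
# averaging it is assumed to perform (element-wise mean of per-state posterior
# columns across the prediction files; the tab-separated gzip layout and the
# output file name are assumptions):
#
# import pandas as pd
# def average_multiple_result_files_sketch(fn_list, chrom_index, out_dir):
#     df_list = [pd.read_csv(fn, header=0, sep='\t') for fn in fn_list]
#     avg_df = sum(df_list) / float(len(df_list))  # element-wise mean across prediction cell types
#     out_fn = os.path.join(out_dir, 'chr' + str(chrom_index) + '_avg_pred.txt.gz')
#     avg_df.to_csv(out_fn, sep='\t', index=False, compression='gzip')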
def __init__(self, flags, model_name=""):
    # Model Parameters
    self.filters = flags.filters
    self.min_filters = flags.min_filters
    self.nin_filters = flags.nin_filters
    self.nin_filters2 = flags.nin_filters2 if flags.nin_filters2 != 0 else flags.nin_filters // 2
    self.cnn_size = flags.cnn_size
    self.last_cnn_size = flags.last_cnn_size
    self.cnn_stride = 1
    self.layers = flags.layers
    self.nin = flags.nin
    self.bicubic_init = flags.bicubic_init
    self.dropout = flags.dropout
    self.activator = flags.activator
    self.filters_decay_gamma = flags.filters_decay_gamma

    # Training Parameters
    self.initializer = flags.initializer
    self.weight_dev = flags.weight_dev
    self.l2_decay = flags.l2_decay
    self.optimizer = flags.optimizer
    self.beta1 = flags.beta1
    self.beta2 = flags.beta2
    self.momentum = flags.momentum
    self.batch_num = flags.batch_num
    self.batch_image_size = flags.batch_image_size
    if flags.stride_size == 0:
        self.stride_size = flags.batch_image_size // 2
    else:
        self.stride_size = flags.stride_size

    # Learning Rate Control for Training
    self.initial_lr = flags.initial_lr
    self.lr_decay = flags.lr_decay
    self.lr_decay_epoch = flags.lr_decay_epoch

    # Dataset or Others
    self.dataset = flags.dataset
    self.test_dataset = flags.test_dataset

    # Image Processing Parameters
    self.scale = flags.scale
    self.max_value = flags.max_value
    self.channels = flags.channels
    self.jpeg_mode = flags.jpeg_mode
    self.output_channels = self.scale * self.scale

    # Environment (directory names should not contain a trailing '/')
    self.checkpoint_dir = flags.checkpoint_dir
    self.tf_log_dir = flags.tf_log_dir

    # Debugging or Logging
    self.debug = flags.debug
    self.save_loss = flags.save_loss
    self.save_weights = flags.save_weights
    self.save_images = flags.save_images
    self.save_images_num = flags.save_images_num
    self.log_weight_image_num = 32

    # initialize variables
    self.name = self.get_model_name(model_name)
    self.batch_input = self.batch_num * [None]
    self.batch_input_quad = np.zeros(shape=[self.batch_num, self.batch_image_size, self.batch_image_size, self.scale * self.scale])
    self.batch_true_quad = np.zeros(shape=[self.batch_num, self.batch_image_size, self.batch_image_size, self.scale * self.scale])
    self.receptive_fields = 2 * self.layers + self.cnn_size - 2
    self.complexity = 0

    # initialize environment
    util.make_dir(self.checkpoint_dir)
    util.make_dir(flags.graph_dir)
    util.make_dir(self.tf_log_dir)
    if flags.initialise_tf_log:
        util.clean_dir(self.tf_log_dir)
    util.set_logging(flags.log_filename, stream_log_level=logging.INFO, file_log_level=logging.INFO, tf_log_level=tf.logging.WARN)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.InteractiveSession(config=config)
    self.init_train_step()

    logging.info("\nDCSCN -------------------------------------")
    logging.info("%s [%s]" % (util.get_now_date(), self.name))
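# A minimal instantiation sketch (a guess at typical usage, not taken from this
# repo's training script: the class name and flag object are assumptions;
# `flags` can be any object exposing the attributes read in __init__, e.g. a
# parsed tf.app.flags.FLAGS):
#
#   flags = tf.app.flags.FLAGS
#   model = SuperResolution(flags, model_name="")
#   model.load_datasets("training", "data/" + flags.dataset,
#                       "batch_data/" + flags.dataset,
#                       flags.batch_image_size, flags.stride_size)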
def main(net_config, ckpt_for_init):
    ## load the config
    config = configs.Config(net_config)

    ## set the logger
    test_dir = os.path.join(config.log_dir, "test")
    log_dir = helper.make_dir([test_dir], re_create_dir=True)
    log_file = os.path.join(log_dir, config.net_config + '_test.txt')
    csv_file = os.path.join(log_dir, config.net_config + '_test.csv')
    logger = helper.Logger(log_file)
    logger.add(config.config_str, do_print=True)

    ## load the datasets from the csv file (train, val, feat_len)
    data = input_data.load_datasets(config.input_csv)  # data has train.next_batch(xx), test.images, test.labels
    feat_len = data.feat_len

    ## set the input placeholders
    layer = 'input'
    with tf.name_scope(layer) as scope:
        x = tf.placeholder(tf.float32, [None, feat_len], name='input')
        y = tf.placeholder(tf.float32, [None, 1], name='output')
        keep_prob = tf.constant(1.0, name='keep_prob')

    ## call inference and compute the output
    y_ = deepnets.inference(config, input_tensors={"x": x, "keep_prob": keep_prob})

    ## set the global step
    global_step = tf_utils.get_global_step()

    ## tensors to compute the validation loss
    with tf.name_scope('validation') as scope:
        val_loss = loss.compute_loss(est=y_, gt=y, loss_func=config.test_loss)
        val_summary = tf.summary.scalar('val_loss', val_loss)

    init_op = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init_op)

    ## saving and restoring operations
    restore_variables = tf_utils.get_model_varaibles() + \
        tf.get_collection("GLOBAL_STEP") + \
        tf.get_collection('BN_VARIABLES')
    saver = tf.train.Saver(restore_variables)
    step_init = tf_utils.restore_model(config, sess, restore_variables, ckpt_for_init, logger)
    summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

    ## do the validation
    features = np.concatenate((data.train.features, data.val.features), axis=0)
    output = np.concatenate((data.train.output, data.val.output), axis=0)
    feed = {x: features, y: output}
    est, v_loss, v_summary = sess.run([y_, val_loss, val_summary], feed_dict=feed)

    # input_headers = [x.encode('latin1') for x in data.input_header]
    headers = ','.join(data.input_header) + ", gt-y, est-y"
    vals = np.concatenate((features, output, est), axis=1)

    # append the dataset's default mu and sigma for the estimated values
    mu = np.append(data.mu, data.mu[-1])
    sigma = np.append(data.sigma, data.sigma[-1])
    # reverse the standardization operation on the vals
    vals = np.add(vals * sigma, mu)
    np.savetxt(csv_file, vals, header=headers, delimiter=",")

    summary_writer.add_summary(v_summary, step_init)
    logger.add('val_loss {:f}'.format(v_loss), do_print=True)
    logger.save()
def signal_ensemble(data_loc, resolution, plot_loc):
    """
    Produces a heatmap of the position distributions.

    data_loc   : directory where the simulated data is located
    resolution : number of bins for the histogram
    plot_loc   : directory where the plots will be placed
    """
    # making a list of all the files
    file_list = glob.glob(f"{data_loc}/experiment*")

    # importing all of the data from the experiments
    print("Importing data...")
    all_data = np.array([np.load(file) for file in file_list])

    # extracting time series (assumes common time scaling across experiments)
    ts = all_data[0][0]

    # extracting all position data
    pos_data = np.array([all_data[i][1] for i in range(len(all_data))])

    print("Producing plots...")
    # creating a folder to save the plots
    make_dir(plot_loc)

    # the histogram range is experiment agnostic and determined from the data directly
    amplitude = max(abs(pos_data[0])) * 1.5  # for starters

    # a 2D matrix where each row is a normalized histogram of position for one timestep
    ensemble_histogram = np.array([
        np.histogram(pos_data.T[i],
                     bins=resolution,
                     range=(-amplitude, amplitude),
                     weights=np.ones_like(pos_data.T[i]) / len(pos_data.T[i]))[0]
        for i in range(len(ts))
    ])

    # displaying the plot using figures and subplots
    fig, ax = plt.subplots(1)
    fig.set_figheight(6)
    fig.set_figwidth(12)
    # transposing so that the x-axis is time and the y-axis is position
    hist = ax.pcolor(ensemble_histogram.T, cmap='inferno')

    # adding colorbar
    fig.colorbar(hist, ax=ax)

    # relabeling the axes with the correct tick values
    # firstly, the xticks (sticking to just 10 ticks)
    x_t_pos = np.linspace(0, len(ts), 10)
    x_t_labels = [f"{t:.2f}" for t in ts[::len(ts) // 10]]  # choosing the right time values
    ax.set_xticks(x_t_pos)
    ax.set_xticklabels(x_t_labels, fontsize=12)

    # now, the yticks
    y_t_pos = np.linspace(0, resolution, 10)
    y_t_labels = [f"{x:.2E}" for x in np.linspace(-amplitude, amplitude, 10)]
    ax.set_yticks(y_t_pos)
    ax.set_yticklabels(y_t_labels, fontsize=12)

    ax.set_xlabel("Time [s]", fontsize=16)
    ax.set_ylabel("Position [m]", fontsize=16)
    ax.set_title("Probability distribution of nanosphere across time", fontsize=18)
    plt.savefig(f"{plot_loc}/signal_histogram.png", bbox_inches='tight')
    plt.show()
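# A minimal usage sketch (directory names are hypothetical and follow the
# naming convention of the simulation driver below):
#
#   signal_ensemble(data_loc="data_n100_run1", resolution=64,
#                   plot_loc="plot_n100_run1")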
PLOT_DIR = "plot"

if timestep is None:
    timestep = 1e-3

if label is None:
    # if a label is not provided, then use the default naming conventions
    DATA_DIR += f"_n{NUM_TRIALS}_{timestamp}"
    PLOT_DIR += f"_n{NUM_TRIALS}_{timestamp}"
else:
    # if a label is provided, then use the label for directory naming
    DATA_DIR += f"_{label}"
    PLOT_DIR += f"_{label}"

make_dir(DATA_DIR)  # making the directory where the data will end up

# first, running the simulations
print("Running Simulations!")
print(f"Maximum simulation time\t:{max_time}\n"
      f"Damping (gamma)\t:{gamma}\n"
      f"Temperature (kB T)\t:{kBT}\n"
      f"Saving frequency\t:{saving_freq} steps per save")

# running multiple trials so as to generate an average
for trial_num in range(NUM_TRIALS):
    print(f"\r{trial_num}/{NUM_TRIALS}", end="")
    ts, xs, vs, ks, ps = simulation.trapSolver(
        [simulation.var_stiffness, mass, max_time, gamma, kBT],
        timestep, saving_freq)
    # save this trial's data
    simulation.save_data([ts, xs, vs, ks, ps], DIR_NAME=DATA_DIR, file_index=trial_num)
print("\nSimulations are complete")

# running statistics, particularly the histogram
import statistics