def train_from_scratch(config, state, channel):
    """Prepare the save directory and model config, then launch training.

    The options of the selected model are read from ``config[config.model]``
    (``save_model_dir``, ``reload_``, ``from_dir``, ``dataset``).  The final
    config is pickled to ``model_config.pkl`` inside the save directory and
    every top-level config entry is copied onto ``state`` as an attribute.

    :param config: experiment config; indexable by key and by attribute.
    :param state: object that receives a copy of every top-level config key.
    :param channel: passed through untouched to the model's trainer.
    :raises NotImplementedError: if ``config.model`` is not ``'attention'``.
    """
    import glob  # local import: only needed by the erase_history clean-up

    # Model options
    save_model_dir = config[config.model].save_model_dir
    if save_model_dir == 'current':
        # to facilitate the use of cluster for multiple jobs
        config[config.model].save_model_dir = './'
        save_model_dir = './'
    # os.path.join also works when the directory has no trailing separator
    # (plain string concatenation produced e.g. 'outmodel_config.pkl').
    save_path = os.path.join(save_model_dir, 'model_config.pkl')
    print('current save dir  %s' % save_model_dir)
    utils.create_dir_if_not_exist(save_model_dir)

    reload_ = config[config.model].reload_
    if reload_:
        print('preparing reload')
        save_dir_backup = config[config.model].save_model_dir
        from_dir_backup = config[config.model].from_dir
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print('save dir  %s' % save_dir_backup)
        print('from_dir  %s' % from_dir_backup)
        print('setting current model config with the old one')
        model_config_old = utils.load_pkl(
            os.path.join(from_dir_backup, 'model_config.pkl'))
        set_config(config, model_config_old)
        # set_config may have overwritten these fields with the old run's
        # values, so restore the ones that describe the current run.
        config[config.model].save_model_dir = save_dir_backup
        config[config.model].from_dir = from_dir_backup
        config[config.model].reload_ = True

    if config.erase_history:
        print('erasing everything in  %s' % save_model_dir)
        # Same effect as `rm <dir>/*` (non-hidden files and links only,
        # sub-directories are left alone) without building a shell command.
        for entry in glob.glob(os.path.join(save_model_dir, '*')):
            if os.path.isfile(entry) or os.path.islink(entry):
                os.remove(entry)

    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    model_type = config.model
    print('Model Type: %s' % model_type)
    print('Dataset: %s' % config[config.model].dataset)
    print('Command: %s' % ' '.join(sys.argv))

    if config.model == 'attention':
        model_deepRNN.train_from_scratch(state, channel)
    else:
        raise NotImplementedError()
def train_util(params):
    """Resolve the run configuration, persist it as JSON and start training.

    When ``params['reload_model']`` is truthy the configuration stored in
    ``params['from_dir']`` is loaded and used instead of ``params``, except
    for the run-specific keys listed in ``run_specific_keys`` below, which
    are taken from ``params``.  Otherwise ``feats_dir`` gets the suffix
    ``<cnn_name>_kmeans3/`` unless ``cnn_name`` is ``"MURALI"``.

    :param params: dict of training options; it is also expanded as keyword
        arguments in the call to ``train``.
    """
    import os  # local import: used only to build the config file paths

    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)

    if params['reload_model']:
        print('preparing reload')
        from_dir = params['from_dir']
        # never start retrain in the same folder
        assert save_dir != from_dir
        print('save dir  %s' % save_dir)
        print('from_dir  %s' % from_dir)
        print('setting current model config with the old one')
        # os.path.join also works when from_dir lacks a trailing separator.
        model_config_old = utils.read_from_json(
            os.path.join(from_dir, 'model_config.json'))
        model_config_old['reload_model'] = True
        # Options that belong to the current run override the stored ones.
        run_specific_keys = ('save_dir', 'from_dir', 'max_epochs',
                             'dispFreq', 'sampleFreq', 'validFreq', 'debug')
        for key in run_specific_keys:
            model_config_old[key] = params[key]
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']

    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir

    config_save_path = os.path.join(save_dir, "model_config.json")
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)

    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
def frames_to_feat(cnn, vid_ids_path, num_vids):
    """Extract per-frame CNN features for every video and save them as .npy.

    For each video id listed in ``vid_ids_path`` (last four characters
    stripped), a subset of frames chosen by ``extract_frames_equally_spaced``
    is run through the selected network and the stacked features are written
    to ``config.MSVD_FEATS_DIR + cnn + "/" + <vid> + ".npy"``.

    :param cnn: one of "ResNet50", "ResNet152", "InceptionV3", "VGG19".
    :param vid_ids_path: path of the file listing the video ids.
    :param num_vids: expected number of ids; checked with an assert.
    :raises NotImplementedError: for any other ``cnn`` value.
    """
    if cnn == "ResNet50":
        model, height, width, preprocess_input = get_ResNet50_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "ResNet152":
        model, height, width, preprocess_input = get_ResNet152_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "InceptionV3":
        model, height, width, preprocess_input = get_InceptionV3_model()
        FEAT_DIM = config.INCEPTION_FEAT_DIM
    elif cnn == "VGG19":
        model, height, width, preprocess_input = get_VGG19_model()
        FEAT_DIM = config.VGG_FEAT_DIM
    else:
        raise NotImplementedError()

    feat_save_path = config.MSVD_FEATS_DIR + cnn + "/"
    print("saving feats to : " + feat_save_path)
    utils.create_dir_if_not_exist(feat_save_path)

    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]
    assert len(vid_ids) == num_vids

    for vid in vid_clips_list:
        print("extracting features from : " + vid)
        vid_frames_dir = config.MSVD_FRAMES_DIR + "/" + vid
        frames_list = utils.read_dir(vid_frames_dir)
        # Cap the number of frames considered per video.
        n_frames = min(len(frames_list), config.MAX_FRAMES)
        selected_frames = extract_frames_equally_spaced(
            n_frames, config.FRAME_SPACING)

        # Collect the features and stack once: calling np.vstack inside the
        # loop re-copies the whole accumulated array for every frame.  The
        # empty float32 seed keeps the (0, FEAT_DIM) result for videos with
        # no selected frames and the same dtype promotion as before.
        frame_feats = [np.empty((0, FEAT_DIM), dtype=np.float32)]
        for fid in selected_frames:
            img_path = vid_frames_dir + "/frame" + str(fid) + ".jpg"
            frame_feats.append(
                img_to_feat(img_path, height, width, preprocess_input, model))
        vid_feats = np.vstack(frame_feats)

        print(vid_feats.shape)
        np.save(feat_save_path + vid + ".npy", vid_feats)
def feats_kmeans(cnn, vid_ids_path, num_vids, org_dim, k):
    """Replace each video's frame features by its k k-means centroids.

    Reads ``config.MSVD_FEATS_DIR + cnn + "/" + <vid> + ".npy"`` for every
    video id in ``vid_ids_path`` (last four characters stripped), fits
    ``KMeans`` with ``k`` clusters and saves the cluster centres under
    ``config.MSVD_FEATS_DIR + cnn + "_kmeans" + str(k) + "/"``.

    :param cnn: name of the feature sub-directory to read from.
    :param vid_ids_path: path of the file listing the video ids.
    :param num_vids: expected number of ids; checked with an assert.
    :param org_dim: not used by this function.
    :param k: number of clusters, i.e. rows kept per video.
    """
    out_dir = config.MSVD_FEATS_DIR + cnn + "_kmeans" + str(k) + "/"
    print("saving feats to :" + " " + out_dir)
    utils.create_dir_if_not_exist(out_dir)

    listed_ids = utils.read_file_to_list(vid_ids_path)
    clip_names = [entry[:-4] for entry in listed_ids]
    assert len(listed_ids) == num_vids

    in_dir = config.MSVD_FEATS_DIR + cnn + "/"
    for clip in clip_names:
        frame_feats = np.load(in_dir + clip + ".npy")
        # Fixed random_state keeps the centroids reproducible across runs.
        clustering = KMeans(n_clusters=k, init='k-means++', random_state=0)
        centroids = clustering.fit(frame_feats).cluster_centers_
        np.save(out_dir + clip + ".npy", centroids)
def save_feats(whichdata):
    """Split a CSV of encoded video features into one .npy file per video.

    Each row of the CSV is reshaped to ``(32, 1024)`` and saved as
    ``<whichdata>_<row index>.npy`` in ``config.MURALI_MSVD_FEATS_DIR``.

    :param whichdata: ``"train"`` or ``"test"``; selects the CSV path and
        the expected number of rows from ``config``.
    :raises NotImplementedError: for any other ``whichdata`` value.
    """
    out_dir = config.MURALI_MSVD_FEATS_DIR
    print("saving feats to :" + " " + out_dir)
    utils.create_dir_if_not_exist(out_dir)

    if whichdata == "train":
        csv_path = config.MURALI_MSVD_ENCODED_FEATS_TRAIN
        expected_rows = config.MURALI_TRAIN_VIDS
    elif whichdata == "test":
        csv_path = config.MURALI_MSVD_ENCODED_FEATS_TEST
        expected_rows = config.MURALI_TEST_VIDS
    else:
        raise NotImplementedError()

    encoded = np.loadtxt(csv_path, delimiter=',')
    print(encoded.shape)
    n_rows, _ = encoded.shape
    assert n_rows == expected_rows

    prefix = out_dir + whichdata + "_"
    for row_idx in range(n_rows):
        np.save(prefix + str(row_idx) + ".npy",
                encoded[row_idx].reshape(32, 1024))
def transform_lfw(input_dir_path, output_dir_path, size, mode, split=True, ratio=0.9):
    """
    Collect every ``.jpg`` found one directory level below ``input_dir_path``,
    shuffle the list and hand it to ``transform``.

    :param input_dir_path: input path to lfw images (in default lfw format)
    :param output_dir_path: output dir for processed images
    :param size: should be tuple (x, y) opencv style
    :param mode: grayscale supported, for others nothing happens
    :param split: should we split into train set and test set
    :param ratio: how much of the images go to the train set if split is True
    :return: None
    """
    create_dir_if_not_exist(output_dir_path)

    images_paths = []
    for entry in os.listdir(input_dir_path):
        person_dir = os.path.join(input_dir_path, entry)
        if not os.path.isdir(person_dir):
            continue
        images_paths.extend(
            os.path.join(person_dir, file_name)
            for file_name in os.listdir(person_dir)
            if file_name.endswith('.jpg'))

    shuffle(images_paths)

    if not split:
        transform(images_paths, size, output_dir_path, mode)
        return

    train_dir = os.path.join(output_dir_path, 'train')
    test_dir = os.path.join(output_dir_path, 'test')
    create_dir_if_not_exist(train_dir)
    create_dir_if_not_exist(test_dir)

    # The first `ratio` share of the shuffled list is the train set.
    cut = int(len(images_paths) * ratio)
    transform(images_paths[:cut], size, train_dir, mode)
    transform(images_paths[cut:], size, test_dir, mode)
def train_from_scratch(config, state, channel):
    """Save the experiment config and start NADE training.

    Fills in ``n_in``/``n_out`` for the ``'MNIST_binary_russ'`` dataset,
    pickles ``config`` to ``model_config.pkl`` in ``config.save_model_path``
    (``'current'`` means ``'./'``) and copies every top-level config entry
    onto ``state`` before calling the trainer.

    :param config: experiment config; indexable by key and by attribute.
    :param state: object that receives a copy of every top-level config key.
    :param channel: passed through untouched to the trainer.
    """
    import os  # local import: used only to build the config file path

    model_type = config.model

    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784

    # save the config file
    if config.save_model_path == 'current':
        # to facilitate the use of cluster for multiple jobs
        config.save_model_path = './'
    # os.path.join also works when the path has no trailing separator
    # (plain concatenation produced e.g. 'outmodel_config.pkl').
    save_path = os.path.join(config.save_model_path, 'model_config.pkl')
    utils.create_dir_if_not_exist(config.save_model_path)

    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    print('Model Type: %s' % model_type)
    print('Host:       %s' % socket.gethostname())
    print('Command:    %s' % ' '.join(sys.argv))

    print('initializing data engine')
    # No data engine is built here; the trainer receives None.
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine, channel)
def train_from_scratch(config, state, channel):
    """Persist the experiment config, mirror it onto ``state`` and train.

    ``config.save_model_path`` selects where ``model_config.pkl`` is
    written; the special value ``'current'`` is rewritten to ``'./'``.
    For the ``'MNIST_binary_russ'`` dataset the model's ``n_in`` and
    ``n_out`` are set to 784.

    :param config: experiment config; indexable by key and by attribute.
    :param state: object that receives a copy of every top-level config key.
    :param channel: passed through untouched to the trainer.
    """
    import os  # local import: used only to build the config file path

    model_type = config.model

    # set up automatically some fields in config
    if config.dataset.signature == 'MNIST_binary_russ':
        config[model_type].n_in = 784
        config[model_type].n_out = 784

    # save the config file
    if config.save_model_path == 'current':
        # to facilitate the use of cluster for multiple jobs
        config.save_model_path = './'
    utils.create_dir_if_not_exist(config.save_model_path)
    # Joining with os.path.join keeps the file inside the directory even
    # when save_model_path was given without a trailing separator.
    save_path = os.path.join(config.save_model_path, 'model_config.pkl')

    print('saving model config into %s' % save_path)
    utils.dump_pkl(config, save_path)

    # Also copy back from config into state.
    for key in config:
        setattr(state, key, config[key])

    print('Model Type: %s' % model_type)
    print('Host:       %s' % socket.gethostname())
    print('Command:    %s' % ' '.join(sys.argv))

    print('initializing data engine')
    # No data engine is built here; the trainer receives None.
    data_engine = None
    deep_orderless_bernoulli_nade.train_from_scratch(state, data_engine, channel)
def random_comparison(arguments, rep_num=0):
    """Run one randomized comparison between the bed files of a pair.

    For every extractor a matrix is extracted with randomized reads, then
    the collected matrices are scored with ``compare_pair``.

    :param arguments: tuple ``(matrices_extractors, genes, gene_list)``;
        each extractor entry is a dict with keys ``"me"`` and
        ``"bed_file_name"``.
    :param rep_num: index of this repetition; forwarded to
        ``extract_matrices`` and used as the CSV file name when
        ``save_matrix_01`` is set.
    :return: dict with ``'pair_name'`` (``"<stem1>:<stem2>"``) and
        ``'match_score'``.
    """
    matrices_extractors, genes, gene_list = arguments
    matrix_01_pair = []
    for matrices_extractor in matrices_extractors:
        bed_file_name = matrices_extractor["bed_file_name"]
        me = matrices_extractor["me"]

        # extract the matrices
        # Bind the result to a name: the save branch below used `matrix_01`
        # without ever defining it and raised NameError.
        # NOTE(review): presumably extract_matrices returns a single
        # DataFrame in randomized mode - confirm before enabling
        # save_matrix_01.
        matrix_01 = me.extract_matrices(
            areReadsRandomized=True,
            add_small_random_value=add_small_random_value_to_random_comparison,
            rep_num=rep_num)
        matrix_01_pair.append({'matrix': matrix_01, 'file_name': bed_file_name})

        if save_matrix_01:
            create_dir_if_not_exist([random_comparisons_folder_matrix_01])
            save_dir = os.path.join(random_comparisons_folder_matrix_01, bed_file_name)
            create_dir_if_not_exist([save_dir])
            matrix_01.to_csv(os.path.join(save_dir, str(rep_num) + ".csv"),
                             index=True, header=True, decimal='.', sep=',',
                             float_format='%.6f')

    # compare the pair of bed files (their 0-1 matrices)
    match_score, pair_names = compare_pair(
        matrix_01_pair, genes.set_index('GeneID'), gene_list)
    pair_names = Path(pair_names[0]).stem + ":" + Path(pair_names[1]).stem
    return {'pair_name': pair_names, 'match_score': match_score}
def compare_pair_n_times_serial(bed_files_pair, genes, gene_list, n):
    """Run ``n`` random comparisons for one pair of bed files, sequentially.

    :param bed_files_pair: iterable of dicts with keys ``"bed_file"`` and
        ``"bed_file_name"``.
    :param genes: passed to ``MatricesExtractor`` and ``random_comparison``.
    :param gene_list: passed to ``random_comparison``.
    :param n: number of random comparisons to run.
    :return: list with the ``n`` results of ``random_comparison``.
    """
    # Build one extractor per bed file of the pair.
    extractors = [
        {"me": MatricesExtractor(entry["bed_file"], genes, entry["bed_file_name"]),
         "bed_file_name": entry["bed_file_name"]}
        for entry in bed_files_pair
    ]
    arguments = extractors, genes, gene_list

    # Perform the n random comparisons; the repetition index is forwarded.
    match_scores = [random_comparison(arguments, rep) for rep in range(n)]

    if save_random_match_scores:
        create_dir_if_not_exist([random_comparisons_folder_match_scores])
        for rep, result in enumerate(match_scores):
            save_dir = os.path.join(random_comparisons_folder_match_scores,
                                    result["pair_name"])
            create_dir_if_not_exist([save_dir])
            csv_path = os.path.join(save_dir, str(rep) + ".csv")
            result["match_score"].to_csv(csv_path, index=True, header=True,
                                         decimal='.', sep=',',
                                         float_format='%.6f')
    return match_scores
def compare_pair_n_times(bed_files_pair, genes, gene_list, n):
    """Run ``n`` random comparisons for one pair of bed files in parallel.

    The comparisons are dispatched to a ``multiprocessing.Pool`` with
    ``num_task`` workers; results are returned in submission order.

    :param bed_files_pair: iterable of dicts with keys ``"bed_file"`` and
        ``"bed_file_name"``.
    :param genes: passed to ``MatricesExtractor`` and ``random_comparison``.
    :param gene_list: passed to ``random_comparison``.
    :param n: number of random comparisons to run.
    :return: list with the ``n`` results of ``random_comparison``.
    """
    # extract a pair of bed files
    matrices_extractors = []
    for bed_files_dict in bed_files_pair:
        # FIXME: the extractor objects are created up front; verify that
        # this is correct.
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]
        me = MatricesExtractor(bed_file, genes)
        matrices_extractors.append({"me": me, "bed_file_name": bed_file_name})
    arguments = matrices_extractors, genes, gene_list

    pool = multiprocessing.Pool(processes=num_task)
    try:
        # Forward the repetition index: without it every task ran with
        # rep_num=0, unlike the serial variant of this function.
        pending = [pool.apply_async(random_comparison, [arguments, rep_num])
                   for rep_num in range(n)]
        pool.close()
        pool.join()
        match_scores = [job.get() for job in pending]
    finally:
        # Release the workers even if submission or a task failed.
        pool.terminate()

    if save_random_match_scores:
        create_dir_if_not_exist([random_comparisons_folder_match_scores])
        for rep_num, result in enumerate(match_scores):
            save_dir = os.path.join(random_comparisons_folder_match_scores,
                                    result["pair_name"])
            create_dir_if_not_exist([save_dir])
            result["match_score"].to_csv(
                os.path.join(save_dir, str(rep_num) + ".csv"),
                index=True, header=True, decimal='.', sep=',',
                float_format='%.6f')
    return match_scores
input_data = { 'raw_spectrograms': np.array(raw_spectrograms), 'std_spectrograms': np.array(std_spectrograms), 'file_name': np.array(mat_file_names), 'segment': pdata['segment'] } if pdata['dtype'] == 'train': input_data['target'] = pdata['target'] np.save(os.path.join(output_dir, 'data.npy'), input_data) def collect_and_write_data(p_idx): pid = int(p_idx / 2) is_test = p_idx % 2 != 0 # write training data pdata = utils.load_data(pid, is_test=is_test) write_patient_data(pdata) return 1 if __name__ == "__main__": create_dir_if_not_exist(utils.output_directory) result = futures.map(collect_and_write_data, range(6)) print list(result)
in_features=n_features, out_features=1, regression=True) net = net.to(device) optimizer = optim.Adam(net.parameters(), lr=opt.lr, betas=(0.9, 0.999)) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1) now = datetime.datetime.now() folder_name = os.path.join('modelnet_regression_log', opt.model, 'width' + str(width)) start_run_time = str(now.day) + str(now.month) + str(now.hour) + str( now.minute) filename = 'optimize=' + str(optimizer)[:3] + '.csv' utils.create_dir_if_not_exist(os.path.join(opt.outf, folder_name)) logpath = os.path.join(opt.outf, folder_name, filename) cols = ['epoch', 'train_loss', 'test_loss'] logger = utils.Logger(logpath, cols) loss_func = nn.MSELoss() nepoch = 50 for epoch in range(nepoch): epoch_loss = 0.0 # scheduler.step() for step, data in enumerate(dataloader): # for each training step if opt.model == "GraphNet": batch_x, A, batch_y = data A = A.float()
def calc_reproducible_sequences(match_scores_list, gene_list, pair_names_list, match_scores_real, matrix_01_list):
    """Select reproducible genes and write their reproducible sequence to CSV.

    Steps performed:
      1. collect, per pair name and gene, the list of match scores obtained
         in the random comparisons;
      2. for each pair/gene compute mean and standard deviation of that list
         and turn the real match score into a z-score and a p-value
         (``st.norm.sf(abs(z_score))``);
      3. per gene, sort the p-values, compare them with the critical values
         ``rank / len(pair_names_list) * FDR`` and keep the gene only when
         the selected prefix covers all pairs;
      4. pass the kept genes to ``extract_reproducible_sequences``, zero the
         positions outside the returned mask and save the result as
         ``reproducible_sequence.csv`` in ``reproducible_sequence_output_dir``.

    :param match_scores_list: iterable of iterables of dicts with keys
        ``'pair_name'`` and ``'match_score'`` (a mapping gene -> score).
    :param gene_list: row labels of the p-value matrix.
    :param pair_names_list: column labels of the p-value matrix.
    :param match_scores_real: iterable of dicts with keys ``"pair_name"``
        and ``"match_score"`` holding the scores of the real comparison.
    :param matrix_01_list: forwarded to ``extract_reproducible_sequences``.
    :return: None; the result is written to disk.
    """
    # compute the match score histograms for the random comparisons
    match_scores_hist = {}
    for fake_match_scores in match_scores_list:
        for fake_match_score in fake_match_scores:
            # fake_match_score contains the scores of one pair
            pair_name = fake_match_score['pair_name']
            match_scores_fake = fake_match_score['match_score']
            # one-element list per gene; used as is the first time a pair
            # name is seen
            gene_hist = {}
            for gene, match_score in match_scores_fake.items():
                gene_hist[gene] = [match_score]
            if pair_name in match_scores_hist:
                # pair already known: extend the per-gene score lists
                for gene, match_score in match_scores_fake.items():
                    match_scores_hist[pair_name][gene].append(match_score)
            else:
                match_scores_hist[pair_name] = gene_hist
    # rows are genes, columns are pair names
    p_value_matrix = pd.DataFrame(index=gene_list, columns=pair_names_list)
    # counter of processed pair/gene combinations; not read afterwards
    plot_num = 0
    # extract pvalues for each gene and dataset pair
    for pair_name in match_scores_hist:
        for gene in match_scores_hist[pair_name]:
            gene_hist = pd.Series(match_scores_hist[pair_name][gene])
            hist_mean = np.mean(gene_hist)
            hist_std = np.std(gene_hist)
            if plot_data:
                # one histogram image per gene, grouped by pair name
                match_scores_hist_pair_plot_folder = os.path.join(match_scores_hist_plot_folder, pair_name)
                create_dir_if_not_exist([match_scores_hist_pair_plot_folder])
                sns.set_style('darkgrid')
                plot = sns.distplot(gene_hist, bins=num_bins).set_title("hist_mean: " + str('%.5f' % hist_mean) + " hist_std: " + str('%.5f' % hist_std))
                plot.get_figure().savefig(os.path.join(match_scores_hist_pair_plot_folder, "gene:" + gene))
                plot.get_figure().clf()
            for match_score_real in match_scores_real:
                pair_name_real = match_score_real["pair_name"]
                if pair_name_real == pair_name:
                    real_score = match_score_real["match_score"][gene]
                    # NOTE(review): hist_std is not checked for 0 here;
                    # confirm what happens when all random scores are equal.
                    z_score = (real_score - hist_mean) / hist_std
                    pvalue = st.norm.sf(abs(z_score))
                    p_value_matrix[pair_name][gene] = pvalue
            plot_num += 1
    reproducible_genes = []
    for gene, pvalue_row in p_value_matrix.iterrows():
        pvalue_row = pvalue_row.to_numpy()
        # Library implementation of Benjamini/Hochberg; its result is only
        # counted here and does not influence the selection below.
        y = multipletests(pvals=pvalue_row, alpha=FDR, method="fdr_bh")
        number_of_significative_values_python = len(y[1][np.where(y[1] < FDR)])
        # Hand-written procedure that drives the selection: sort the
        # p-values and compare each with (rank / number of pairs) * FDR.
        pvalue_row = np.sort(pvalue_row)
        critical_values = ((np.nonzero(pvalue_row >= 0)[0] + 1) / len(pair_names_list)) * FDR
        bh_candidates = pvalue_row[pvalue_row <= critical_values]
        if len(bh_candidates) > 0:
            # 1-based position, within bh_candidates, of its largest value
            idx_of_max_value = np.argwhere(bh_candidates == np.amax(bh_candidates)).flatten().tolist()[-1] + 1
            bh_selected = pvalue_row[np.array(range(0, idx_of_max_value))]
            # the gene is reproducible only if every pair is selected
            if len(bh_selected) == len(pair_names_list):
                reproducible_genes.append(gene)
    reproducible_sequence_mask, first_matrix_01_with_only_reproducible_genes = extract_reproducible_sequences(reproducible_genes, matrix_01_list)
    # take the first matrix 01 with only reproducible genes and put to zero the non reproducible parts
    first_matrix_01_with_only_reproducible_genes[~reproducible_sequence_mask] = 0
    reproducible_sequence = pd.DataFrame(first_matrix_01_with_only_reproducible_genes, index=reproducible_genes)
    reproducible_sequence.to_csv(os.path.join(reproducible_sequence_output_dir, "reproducible_sequence.csv"), index=True, header=True, decimal='.', sep=',', float_format='%.6f')
num_task = num_cores - 1 plot_data = True num_bins = 15 # I/O directories input_dir = os.path.join(os.getcwd(), "check_reduced/") # get the path to the data input directory match_scores_output_dir = os.path.join(os.getcwd(), "matrix_python/match_scores/") # Sets the directory where all the saved outputs will be stored reproducible_sequence_output_dir = os.path.join(os.getcwd(), "matrix_python/reproducible_sequence/") # Sets the directory where all the saved outputs will be stored genes_lengths_path = os.path.join(os.getcwd(), "gene_lengths.csv") # path to upload the file containing each gene's ID and the correspondent gene length histogram_plot_path = os.path.join(os.getcwd(), "genes_histograms/") # path to upload the file containing each gene's ID and the correspondent gene length intermediate_results = os.path.join(os.getcwd(), "intermediate_results/") plots_folder = os.path.join(os.getcwd(), "plots/") match_scores_hist_plot_folder = os.path.join(plots_folder, "match_scores_hist/") match_coverage_hist_plot_folder = os.path.join(plots_folder, "coverage_hist/") path_match_score_csv = os.path.join(os.getcwd(), "path_match_score_csv/") create_dir_if_not_exist([input_dir, match_scores_output_dir, histogram_plot_path, reproducible_sequence_output_dir, intermediate_results, plots_folder, match_scores_hist_plot_folder,path_match_score_csv]) FDR = 0.01 def signal_digitalisation(genes, bed_files_dicts, areReadsRandomized, add_small_random_value): matrix_01_list = [] for bed_files_dict in bed_files_dicts: bed_file = bed_files_dict["bed_file"] bed_file_name = bed_files_dict["bed_file_name"] me = MatricesExtractor(bed_file, genes) # extract the matrices pd_matrix_coverage, matrix_01 = me.extract_matrices(areReadsRandomized=areReadsRandomized, add_small_random_value=add_small_random_value) if plot_data: for gene, coverage in pd_matrix_coverage.iterrows(): coverage = coverage[~np.isnan(coverage)]
) #matrix_python_CONTROL_all/reproducible_sequence/")#"matrix_python_472/reproducible_sequence/") # Sets the directory where all the saved outputs will be stored genes_lengths_path = os.path.join( os.getcwd(), "../../CDShumanGenesLengths.txt" ) # path to upload the file containing each gene's ID and the correspondent gene length histogram_plot_path = os.path.join( os.getcwd(), "genes_histograms_LC509_nosoglia_27/" ) # path to upload the file containing each gene's ID and the correspondent gene length intermediate_results = os.path.join(os.getcwd(), "intermediate_results_LC509_nosoglia_27/") random_comparisons_folder_match_scores = os.path.join( os.getcwd(), "random_comparisons_folder_match_scores_LC509_nosoglia/") random_comparisons_folder_matrix_01 = os.path.join( os.getcwd(), "random_comparisons_folder_matrix_01_LC509_nosoglia/") create_dir_if_not_exist([ input_dir, match_scores_output_dir, histogram_plot_path, reproducible_sequence_output_dir, intermediate_results ]) num_comparison = 200 #200 e 50 per fare 10000 # NOTA: numero di confronti random da eseguire per ogni coppia di file bed save_random_match_scores = False save_matrix_01 = False def extract_gene_list(genes, bed_files_dicts): gene_lists = [] for bed_files_dict in bed_files_dicts: bed_file = bed_files_dict["bed_file"] table_FP = bed_file["Chromosome"].value_counts().sort_index( ).rename_axis('GeneID').reset_index(name='ReadsCounts') table_FP_Geneslengths = pd.merge(table_FP, genes, on="GeneID")
# BoXHED 2.0 (https://arxiv.org/pdf/2103.12591.pdf) is a software package # for estimating hazard functions nonparametrically via gradient boosting. # It is orders of magnitude faster than BoXHED 1.0 (http://proceedings.mlr.press/v119/wang20o/wang20o.pdf). # BoXHED 2.0 also allows for more general forms of survival data including recurrent events. #This tutorial demonstrates how to apply BoXHED 2.0 to a synthetic dataset. DATA_ADDRESS = "./data/" # train/test data directory RSLT_ADDRESS = "./results/" # results directory nthread_prep = 20 # number of CPU threads used for preprocessing nthread_train = 20 # number of CPU threads used for training # Create the results' directory if it does not exist for addr in [RSLT_ADDRESS]: create_dir_if_not_exist(addr) # Function: read_train_data # Reads the synthetic training data. # Input: # None # Return: # @ A pandas dataframe containing training data with the following columns: # * ID: subject ID # * t_start: the start time of an epoch for the subject # * t_end: the end time of the epoch # * X_i: values of covariates between t_start and t_end # Sample Output: # ID t_start t_end X_0 delta # 1 0.010000 0.064333 0.152407 0.0
def signal_digitalisation(genes, bed_files_dicts, areReadsRandomized, add_small_random_value):
    """Extract the 0-1 matrix and the coverage matrix of every bed file.

    The coverage matrix of each bed file is saved as
    ``<bed_file_name>.csv`` in ``matrix_coverage_real_dir``.  When
    ``plot_data`` is set, one coverage plot and one 0-1 plot (plus the 0-1
    values as CSV) are written per gene.

    :param genes: passed to ``MatricesExtractor``.
    :param bed_files_dicts: iterable of dicts with keys ``"bed_file"`` and
        ``"bed_file_name"``.
    :param areReadsRandomized: forwarded to ``extract_matrices``.
    :param add_small_random_value: forwarded to ``extract_matrices``.
    :return: list of dicts ``{'matrix': <0-1 matrix>, 'file_name': <name>}``,
        one per bed file.
    """
    matrix_01_list = []
    for bed_files_dict in bed_files_dicts:
        bed_file = bed_files_dict["bed_file"]
        bed_file_name = bed_files_dict["bed_file_name"]

        # extract the matrix coverage and the matrix 01 for each bed file
        me = MatricesExtractor(bed_file, genes, bed_file_name)
        pd_matrix_01, pd_matrix_coverage = me.extract_matrices(
            areReadsRandomized=areReadsRandomized,
            add_small_random_value=add_small_random_value)
        pd_matrix_coverage.to_csv(
            os.path.join(matrix_coverage_real_dir, bed_file_name + ".csv"),
            index=True, header=True, decimal='.', sep=',',
            float_format='%.6f')

        # NOTE(review): both plotting loops are assumed to sit under
        # `if plot_data`; the original nesting should be confirmed.
        if plot_data:
            # plot the matrix coverage of every gene
            # The output folder depends only on the bed file, so create it
            # once instead of once per gene.
            match_scores_hist_pair_plot_folder = os.path.join(
                match_coverage_hist_plot_folder, bed_file_name)
            create_dir_if_not_exist([match_scores_hist_pair_plot_folder])
            for gene, coverage in pd_matrix_coverage.iterrows():
                coverage = coverage[~np.isnan(coverage)]
                x = range(0, len(coverage))
                fig, ax = plt.subplots()
                plot = sns.lineplot(x, coverage, color='black')
                plot.fill_between(x, coverage, color='black')
                plot = sns.lineplot(x, coverage.median(), color='orange',
                                    hue=coverage.median(), palette=["C0"])
                ax.legend(title="Median")
                # A slice step of 0 raises ValueError; that happened for
                # genes with fewer than 13 positions.
                tick_step = max(1, int(len(coverage) * 0.08))
                plot.set(xticks=x[0::tick_step])
                plot.get_figure().savefig(
                    os.path.join(match_scores_hist_pair_plot_folder,
                                 "gene:" + gene + ".pdf"))
                plot.get_figure().clf()
                # pyplot keeps every figure it created until it is closed;
                # close it so one figure per gene does not pile up.
                plt.close(fig)

            # plot the matrix 01 of every gene and save its values
            hist_01_hist_pair_plot_folder = os.path.join(
                hist_01_plot_folder, bed_file_name)
            create_dir_if_not_exist([hist_01_hist_pair_plot_folder])
            for gene, matrix_01 in pd_matrix_01.iterrows():
                matrix_01 = matrix_01[~np.isnan(matrix_01)]
                x = range(0, len(matrix_01))
                plot = sns.lineplot(x, matrix_01, color='black')
                plot.fill_between(x, matrix_01, color='black')
                plot.get_figure().savefig(
                    os.path.join(hist_01_hist_pair_plot_folder,
                                 "gene:" + gene + ".pdf"))
                plot.get_figure().clf()
                matrix_01.to_csv(
                    os.path.join(hist_01_hist_pair_plot_folder,
                                 "gene" + gene + ".csv"),
                    index=True, header=True, decimal='.', sep=',',
                    float_format='%.6f')

        matrix_01_list.append({
            'matrix': pd_matrix_01,
            'file_name': bed_file_name
        })
    return matrix_01_list
if exp_num == 2: return beta.ppf(-math.log(u) / beta.pdf(x, 4, 4) + beta.cdf(t0, 4, 4), 4, 4) if exp_num == 3: return np.exp(x - norm.ppf(u * norm.cdf(x - math.log(t0)))) if exp_num == 4: return np.power( -math.log(u) / (np.exp(-0.5 * math.cos(2 * math.pi * x) - 1.5)) + np.power(t0, 1.5), 2 / 3) file_addr = "./synth_files/" create_dir_if_not_exist(file_addr) num_irr = 40 t_min = 0.01 t_max = {1: 1, 2: 1, 3: 5, 4: 5} num_pcs = 10 max_size = int(14e6) n_sub = {'train': 1000000, 'test': 5000} seed = {'train': 0, 'test': 1} def set_subj_1_to_N(data): subject_converter = dict( zip(sorted(data['subject'].unique()),
def calc_reproducible_sequences(match_scores_list, gene_list, pair_names_list, match_scores_real, matrix_01_list):
    """Select reproducible genes and write the result matrices to CSV.

    Steps performed:
      1. collect, per pair name and gene, the list of match scores obtained
         in the random comparisons;
      2. for each pair/gene compute mean and standard deviation of that list
         and turn the real match score into a z-score and the p-value
         ``1 - st.norm.cdf(z_score)`` (both NaN when the deviation is 0);
      3. per gene, sort the p-values, compare them with the critical values
         ``rank / len(pair_names_list) * FDR`` and keep the gene when the
         selected prefix has at least
         ``round(len(pair_names_list) * reproducibility_min_fraction)``
         entries;
      4. pass the kept genes to ``extract_reproducible_sequences`` and zero
         the positions outside the returned mask.

    Files written to ``reproducible_sequence_output_dir``:
    ``matrix_summary_005.csv``, ``reproducible_sequence_005.csv`` and
    ``global_matrix_005.csv``.

    :param match_scores_list: iterable of iterables of dicts with keys
        ``'pair_name'`` and ``'match_score'`` (a mapping gene -> score).
    :param gene_list: row labels of the result matrices.
    :param pair_names_list: names of the bed-file pairs (column labels).
    :param match_scores_real: iterable of dicts with keys ``"pair_name"``
        and ``"match_score"`` holding the scores of the real comparison.
    :param matrix_01_list: forwarded to ``extract_reproducible_sequences``.
    :return: None; the results are written to disk.
    """
    # compute the match score histograms for the random comparisons
    # match_scores_list contains all the match scores computed during the random comparisons
    match_scores_hist = {}
    for fake_match_scores in match_scores_list:
        for fake_match_score in fake_match_scores:
            pair_name = fake_match_score['pair_name']
            match_scores_fake = fake_match_score['match_score']
            # match_scores_hist aggregates the match scores indexing by pair
            # name (name of the pair of bed files) and gene
            gene_hist = {}
            for gene, match_score in match_scores_fake.items():
                gene_hist[gene] = [match_score]
            if pair_name in match_scores_hist:
                for gene, match_score in match_scores_fake.items():
                    match_scores_hist[pair_name][gene].append(match_score)
            else:
                match_scores_hist[pair_name] = gene_hist

    # the pvalue matrix contains the pvalues indexed by the name of the gene
    # (rows) and the name of the pair of bed files (columns)
    p_value_matrix = pd.DataFrame(index=gene_list, columns=pair_names_list)

    # the matrix summary is a pandas dataframe that summarizes the mean,
    # standard deviation, zscore and pvalue of each gene
    matrix_summary_columns = [[c + "_mean", c + "_std", c + "_zscore", c + "_pvalue"] for c in pair_names_list]
    matrix_summary_columns = np.reshape(
        matrix_summary_columns,
        (np.shape(matrix_summary_columns)[0] * np.shape(matrix_summary_columns)[1])).T
    matrix_summary = pd.DataFrame(index=gene_list, columns=matrix_summary_columns)

    # extract pvalues for each gene and dataset pair
    for pair_name in match_scores_hist:
        for gene in match_scores_hist[pair_name]:
            # compute mean and standard deviation for each gene
            gene_hist = pd.Series(match_scores_hist[pair_name][gene])
            hist_mean = np.mean(gene_hist)
            hist_std = np.std(gene_hist)
            if plot_data:
                # plot the histogram of match scores for each gene
                match_scores_hist_pair_plot_folder = os.path.join(match_scores_hist_plot_folder, pair_name)
                create_dir_if_not_exist([match_scores_hist_pair_plot_folder])
                sns.set_style('darkgrid')
                plot = sns.distplot(gene_hist, bins=num_bins).set_title(
                    "hist_mean: " + str('%.5f' % hist_mean) + " hist_std: " + str('%.5f' % hist_std))
                plot.get_figure().savefig(os.path.join(match_scores_hist_pair_plot_folder, "gene:" + gene))
                plot.get_figure().clf()

            # compute the pvalues of each gene for each pair of bed files
            for match_score_real in match_scores_real:
                pair_name_real = match_score_real["pair_name"]
                if pair_name_real == pair_name:
                    real_score = match_score_real["match_score"][gene]
                    if hist_std != 0:
                        # compute the zscore and corresponding pvalue
                        z_score = (real_score - hist_mean) / hist_std
                        pvalue = 1 - st.norm.cdf(z_score)
                    else:
                        z_score = np.nan
                        pvalue = np.nan
                    # insert the results in the matrix summary
                    # A single .loc write replaces the chained
                    # `frame[column][row] = value` form, which pandas does
                    # not guarantee to write into the original frame.
                    matrix_summary.loc[gene, pair_name + "_mean"] = hist_mean
                    matrix_summary.loc[gene, pair_name + "_std"] = hist_std
                    matrix_summary.loc[gene, pair_name + "_zscore"] = z_score
                    matrix_summary.loc[gene, pair_name + "_pvalue"] = pvalue
                    p_value_matrix.loc[gene, pair_name] = pvalue

    reproducible_genes = []
    matrix_summary.to_csv(os.path.join(reproducible_sequence_output_dir, "matrix_summary_005.csv"),
                          index=True, header=True, decimal='.', sep=',', float_format='%.6f')

    # select the gene only if the number of selected elements reaches this
    # threshold; it does not depend on the gene, so compute it once
    reproducibily_threshold = int(round((len(pair_names_list) * reproducibility_min_fraction)))

    for gene, pvalue_row in p_value_matrix.iterrows():
        pvalue_row = pvalue_row.to_numpy()
        # test the pvalues with a Benjamini/Hochberg style procedure:
        # sort them and compare each with (rank / number of pairs) * FDR;
        # only non-NaN entries (those satisfying `>= 0`) get a rank
        pvalue_row = np.sort(pvalue_row)
        critical_values = ((np.nonzero(pvalue_row >= 0)[0] + 1) / len(pair_names_list)) * FDR
        if len(critical_values) > 0:
            # remove nan from pvalue row so it lines up with critical_values
            pvalue_row = pd.to_numeric(pvalue_row, errors='coerce')
            pvalue_row = pvalue_row[np.logical_not(np.isnan(pvalue_row))]
            bh_candidates = pvalue_row[pvalue_row <= critical_values]
            if len(bh_candidates) > 0:
                # 1-based position, within bh_candidates, of its largest value
                idx_of_max_value = np.argwhere(bh_candidates == np.amax(bh_candidates)).flatten().tolist()[-1] + 1
                bh_selected = pvalue_row[np.array(range(0, idx_of_max_value))]
                if len(bh_selected) >= reproducibily_threshold:
                    reproducible_genes.append(gene)

    # extract the reproducible sequences
    reproducible_sequence_mask, first_matrix_01_with_only_reproducible_genes = extract_reproducible_sequences(
        reproducible_genes, matrix_01_list)
    # take the first matrix 01 with only reproducible genes and put to zero the non reproducible parts
    first_matrix_01_with_only_reproducible_genes[~reproducible_sequence_mask] = 0
    reproducible_sequence = pd.DataFrame(first_matrix_01_with_only_reproducible_genes, index=reproducible_genes)
    reproducible_sequence.to_csv(os.path.join(reproducible_sequence_output_dir, "reproducible_sequence_005.csv"),
                                 index=True, header=True, decimal='.', sep=',', float_format='%.6f')
    p_value_matrix.to_csv(os.path.join(reproducible_sequence_output_dir, "global_matrix_005.csv"),
                          index=True, header=True, decimal='.', sep=',', float_format='%.6f')