def t_test_and_pickle(fnames_dic, chromo, output_dir, group_1_samples,
                      group_2_samples, main_dir, queue_obj, min_counts):
    '''
    T-test every AS event of one chromosome and pickle each result.

    Combines several steps into one call so that a whole chromosome can
    be handed to a single worker.

    For every event file name that get_all_fnames reports for chromo in
    either group, the psi information across samples is collected, a
    p-value and the event name are added to it, and the dictionary is
    saved under output_dir/chromo/<fname>.pickle.

    Args:
        fnames_dic: Ignored. The original body rebound this name to an
            empty dict (marked DEBUG) before using it, so the incoming
            value never influenced the result. Kept in the signature so
            existing callers keep working.
        chromo: Chromosome name; used as the output subdirectory name and
            as the key of the result dictionary.
        output_dir: Directory under which the chromo subdirectory is made.
        group_1_samples: Passed to get_all_fnames and
            get_psi_dic_across_samples as the first group.
        group_2_samples: Passed to get_all_fnames and
            get_psi_dic_across_samples as the second group.
        main_dir: Passed to get_all_fnames and get_psi_dic_across_samples.
        queue_obj: Object whose put() method receives the result.
        min_counts: Passed through to get_psi_dic_across_samples.

    Returns:
        None. The result is delivered through queue_obj.put() as
        {chromo: [value returned by save_dic_as_pickle per event]}.
    '''
    # NOTE(review): t_test_and_pickle is defined a second time further
    # down this file; at import time the later definition replaces this
    # one. Confirm which copy is meant to stay.

    # Keys added to every per-event dictionary.
    pval_str = 'pval'
    event_str = 'event'

    # Create directory to store the pickled dictionaries.
    make_dir(os.path.join(output_dir, chromo))

    # Get list of AS events that need to be t-tested.
    # get_all_fnames is called once per group; according to the original
    # author's note this is what ensures each group contributes at least
    # one element (that check lives in get_all_fnames, not shown here).
    group_1_fnames_list = get_all_fnames(group_1_samples, main_dir, chromo)
    group_2_fnames_list = get_all_fnames(group_2_samples, main_dir, chromo)

    # Merge both groups and remove repeats.
    master_fnames_list = list(set(group_1_fnames_list + group_2_fnames_list))

    # Do t-test between the two groups, one event file at a time.
    fnames_pickled_list = []
    for fname in master_fnames_list:
        # Get dictionary containing psi information for all samples.
        psi_info_dic, _ = get_psi_dic_across_samples(
            fname, group_1_samples, group_2_samples, main_dir, chromo,
            output_dir, min_counts)

        # Add pval and event to the dictionary (both stored as 1-item lists).
        psi_info_dic[pval_str] = [t_test_psi_info(psi_info_dic)]
        # Event name is everything before the first '.' of fname
        # (intended to drop the .miso extension).
        psi_info_dic[event_str] = [fname.split('.')[0]]

        # Save dictionary as a pickle file named <fname>.pickle.
        output_fullpath = os.path.join(output_dir, chromo, fname + '.pickle')
        fnames_pickled_list.append(
            save_dic_as_pickle(psi_info_dic, output_fullpath))

    print('T-tested %s events in %s' % (len(master_fnames_list), chromo))

    # Hand the result back to the caller; this function returns nothing.
    queue_obj.put({chromo: fnames_pickled_list})
def consolidate_miso_across_samples(main_dir, sample_dir_names_list, chromo,
                                    master_fnames_list, output_path):
    '''
    Write one combined, tab-delimited miso file per event of a chromosome.

    For each file name in master_fnames_list the file
    output_path/chromo/<fname> is (over)written with:
        1) a combined header, produced by write_combined_miso_header
           (per the original notes: the counts of all samples added up);
        2) sampled_psi and log_score of each sample, produced by
           write_combined_psi_logscore.

    Original author's note: the inputs are read from
    main_dir/sample_dir/chromo/<fname> for every sample dir, and each of
    them ends up being opened twice (once per helper).

    Args:
        main_dir: Passed through to both write_combined_* helpers.
        sample_dir_names_list: Passed through to both helpers.
        chromo: Chromosome name; used as the output subdirectory name.
        master_fnames_list: File names to consolidate. When empty, nothing
            is written and no directory is created.
        output_path: Directory under which the chromo subdirectory is made.

    Returns:
        None.
    '''
    if not master_fnames_list:
        # Nothing to write; leave the file system untouched.
        return

    import sys  # local import: only needed for the file-mode switch below

    # The output directory is the same for every file, so create it once.
    # make_dir's return value is used as the directory path.
    chr_out_path = make_dir(os.path.join(output_path, chromo))

    # The csv module needs a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3 (a 'wb' handle raises TypeError
    # on the first row there).
    if sys.version_info[0] >= 3:
        open_kwargs = {'mode': 'w', 'newline': ''}
    else:
        open_kwargs = {'mode': 'wb'}

    for fname in master_fnames_list:
        out_fullpath = os.path.join(chr_out_path, fname)
        with open(out_fullpath, **open_kwargs) as writefile:
            writer = csv.writer(writefile, delimiter='\t')
            # Write summary header of file (loops through all samples).
            write_combined_miso_header(sample_dir_names_list, main_dir,
                                       chromo, fname, writer)
            # Write sampled_psi and log_score for each sample.
            write_combined_psi_logscore(sample_dir_names_list, main_dir,
                                        chromo, fname, writer)
def t_test_and_pickle(fnames_dic, chromo, output_dir, group_1_samples,
                      group_2_samples, main_dir, queue_obj, min_counts):
    '''
    T-test every AS event of one chromosome and pickle each result.

    Combines several steps into one call so that a whole chromosome can
    be handed to a single worker.

    For every event file name that get_all_fnames reports for chromo in
    either group, the psi information across samples is collected, a
    p-value and the event name are added to it, and the dictionary is
    saved under output_dir/chromo/<fname>.pickle.

    Args:
        fnames_dic: Ignored. The original body rebound this name to an
            empty dict (marked DEBUG) before using it, so the incoming
            value never influenced the result. Kept in the signature so
            existing callers keep working.
        chromo: Chromosome name; used as the output subdirectory name and
            as the key of the result dictionary.
        output_dir: Directory under which the chromo subdirectory is made.
        group_1_samples: Passed to get_all_fnames and
            get_psi_dic_across_samples as the first group.
        group_2_samples: Passed to get_all_fnames and
            get_psi_dic_across_samples as the second group.
        main_dir: Passed to get_all_fnames and get_psi_dic_across_samples.
        queue_obj: Object whose put() method receives the result.
        min_counts: Passed through to get_psi_dic_across_samples.

    Returns:
        None. The result is delivered through queue_obj.put() as
        {chromo: [value returned by save_dic_as_pickle per event]}.
    '''
    # NOTE(review): t_test_and_pickle is also defined earlier in this
    # file; this later definition is the one that stays bound at import
    # time. Confirm whether the earlier copy can be dropped.

    # Keys added to every per-event dictionary.
    pval_str = 'pval'
    event_str = 'event'

    # Create directory to store the pickled dictionaries.
    make_dir(os.path.join(output_dir, chromo))

    # Get list of AS events that need to be t-tested.
    # get_all_fnames is called once per group; according to the original
    # author's note this is what ensures each group contributes at least
    # one element (that check lives in get_all_fnames, not shown here).
    group_1_fnames_list = get_all_fnames(group_1_samples, main_dir, chromo)
    group_2_fnames_list = get_all_fnames(group_2_samples, main_dir, chromo)

    # Merge both groups and remove repeats.
    master_fnames_list = list(set(group_1_fnames_list + group_2_fnames_list))

    # Do t-test between the two groups, one event file at a time.
    fnames_pickled_list = []
    for fname in master_fnames_list:
        # Get dictionary containing psi information for all samples.
        psi_info_dic, _ = get_psi_dic_across_samples(
            fname, group_1_samples, group_2_samples, main_dir, chromo,
            output_dir, min_counts)

        # Add pval and event to the dictionary (both stored as 1-item lists).
        psi_info_dic[pval_str] = [t_test_psi_info(psi_info_dic)]
        # Event name is everything before the first '.' of fname
        # (intended to drop the .miso extension).
        psi_info_dic[event_str] = [fname.split('.')[0]]

        # Save dictionary as a pickle file named <fname>.pickle.
        output_fullpath = os.path.join(output_dir, chromo, fname + '.pickle')
        fnames_pickled_list.append(
            save_dic_as_pickle(psi_info_dic, output_fullpath))

    print('T-tested %s events in %s' % (len(master_fnames_list), chromo))

    # Hand the result back to the caller; this function returns nothing.
    queue_obj.put({chromo: fnames_pickled_list})