def main(config_file):
    """Run the EndClip Prep step that builds the initial 3'UTR database.

    The configure file is parsed with ``Input_config_file`` and must define
    all six keys listed in ``required_entries`` below; the run is aborted
    through ``sys.exit`` with the matching message at the first missing key.
    Only the gene BED file, the kfXref file and the UTR output file are
    consumed by the step that is currently enabled
    (``Extract_3UTR_from_bed``); the remaining keys belong to steps that are
    disabled at the moment but are still validated.

    Args:
        config_file: Path of the configure file handed to
            ``Input_config_file``.
    """
    # Greeting
    now_time('Beginning EndClip Prep run (v0.1.0)')
    print("-" * 50)

    # Parse configure file
    now_time("Parsing configure file...")
    config_dict = Input_config_file(config_file)

    # Check configure file: (key, message used when the key is missing),
    # evaluated in this order.
    required_entries = (
        ('gene_gtf_file',
         "ERROR: Gene GTF file does not exist..."),
        ('gene_bed_file',
         "ERROR: Gene BED file does not exist..."),
        ('pA_site_file',
         "ERROR: pA site file does not exist..."),
        ('output_kfXref_file',
         "ERROR: output_kfXref_file was not designated..."),
        ('output_utr_file',
         "ERROR: output_utr_file does not exist..."),
        ('output_initial_UTR_database_file',
         "ERROR: output_initial_UTR_database_file was not designated..."),
    )
    for config_key, missing_message in required_entries:
        if config_key not in config_dict:
            sys.exit(missing_message)

    # Extract the settings needed by the enabled step
    bed_path = config_dict['gene_bed_file']
    kfxref_path = config_dict['output_kfXref_file']
    utr_path = config_dict['output_utr_file']

    # Disabled step: prepare symbol_refid_map_file (kfXref file)
    #   now_time("Prepare kfXref file...")
    #   symbol_refid_map = Extract_gene_symbol_map_kfXref_file(gene_gtf_file)
    #   map_file = open(output_kfXref_file, 'w')
    #   for refid in symbol_refid_map.keys():
    #       symbol = symbol_refid_map[refid]
    #       print(refid, symbol, sep="\t", end="\n", file=map_file)

    # Make initial 3'UTR database
    now_time("Prepare initial 3'UTR database...")
    Extract_3UTR_from_bed(bed_path, kfxref_path, utr_path)

    # Disabled step: prepare 3'UTR database with pA site information
    #   now_time("Prepare 3'UTR database with pA site information...")
    #   temp_file = output_initial_UTR_database_file + '.tmp'
    #   cmd = 'bedtools intersect -a %s -b %s -wa -wb > %s' % (output_utr_file, pA_site_file, temp_file)
    #   os.system(cmd)
    #   Merge_pA_site_infor(temp_file, output_initial_UTR_database_file)

    now_time("Completely finished!!")
def _read_float_parameter(config_dict, key, default, default_label):
    """Return ``float(config_dict[key])``, or *default* when *key* is absent.

    When the default is used, a notice naming *key* and *default_label* is
    printed so the run log shows which parameters were not configured.
    """
    if key not in config_dict:
        print(" %s: Default parameter(%s) was designated." % (key, default_label))
        return default
    return float(config_dict[key])


def _open_track_file(sample_path, extension, header_template):
    """Open a per-sample track file and write its track header line.

    The file name is the base name of *sample_path* with its last two
    extensions removed and *extension* appended; it is created relative to
    the current working directory.  The returned handle is left open for the
    caller, which is responsible for closing it.
    """
    track_name = os.path.basename(sample_path)
    track_name = os.path.splitext(os.path.splitext(track_name)[0])[0]
    track_name += extension
    handle = open(track_name, 'w')
    try:
        print(header_template % (track_name, track_name), end="\n", file=handle)
    except Exception:
        # Do not leak the handle when the header cannot be written.
        handle.close()
        raise
    return handle


def main():
    """Run EndSnip: test every annotated 3'UTR for APA events.

    The configure file path is taken from ``sys.argv[1]``.  Example
    configure file::

        Annotated_3UTR=data/hg19_refseq_extracted_3UTR_PTEN_ELAVL1.bed
        PolyA_site_infor=data/polyA_DB_hg19.bed
        Group1_Tophat_aligned_Wig=data/siCTRL_S_accepted_hits_PTEN_ELAVL1.bam.wig
        Group2_Tophat_aligned_Wig=data/siCFIm25-1_accepted_hits_PTEN_ELAVL1.bam.wig
        Output_directory=DaPars_Test_data/
        Output_result_file=DaPars_Test_data
        #Parameters
        Num_least_in_group1=1
        Num_least_in_group2=1
        Coverage_cutoff=30
        FDR_cutoff=0.05
        PDUI_cutoff=0.5
        Fold_change_cutoff=0.59

    The ``Group*_Tophat_aligned_Wig`` values are comma-separated lists of
    files.  Numeric parameters are optional; each one is published as a
    module-level global, using the default announced on stdout when the key
    is missing.

    Side effects: creates the output directory and a ``tmp`` directory inside
    it, writes ``<Output_result_file>_result_temp.txt`` there, and creates
    one ``.bg`` and one ``.bed`` track file per sample in the current working
    directory.  All of these files are closed before the function returns.

    Raises:
        SystemExit: if no configure file is given on the command line or a
            mandatory key is missing or empty.
        ValueError: if a numeric parameter cannot be converted to ``float``.
    """
    global Num_least_in_group1
    global Num_least_in_group2
    global Coverage_cutoff
    global FDR_cutoff
    global Fold_change_cutoff
    global PDUI_cutoff
    global Coverage_pPAS_cutoff
    global test_name

    start_time = time.time()
    now_time("Beginning EndSnip run (v0.1.0)")
    print("-" * 50)

    if len(sys.argv) == 1:
        sys.exit("ERROR: Please provide the configure file...")
    cfg_file = sys.argv[1]

    now_time("Parsing configure file...")
    config_dict = Input_config_file(cfg_file)

    # Check configure file: (key, message used when the key is missing)
    required_entries = (
        ('Group1_Tophat_aligned_Wig', "ERROR: No Tophat aligned BAM file for group 1..."),
        ('Group2_Tophat_aligned_Wig', "ERROR: No Tophat aligned BAM file for group 2..."),
        ('Output_directory', "ERROR: No output directory..."),
        ('Annotated_3UTR', "ERROR: No annotated 3'UTR file..."),
        ('Output_result_file', "ERROR: No result file name..."),
    )
    for config_key, missing_message in required_entries:
        if config_key not in config_dict:
            sys.exit(missing_message)

    # File/Directory
    Group1_Tophat_aligned_file = config_dict['Group1_Tophat_aligned_Wig'].split(',')
    Group2_Tophat_aligned_file = config_dict['Group2_Tophat_aligned_Wig'].split(',')
    output_directory = config_dict['Output_directory']
    if not output_directory:
        # An empty value would otherwise fail with IndexError just below.
        sys.exit("ERROR: No output directory...")
    if output_directory[-1] != '/':
        output_directory += '/'
    Annotated_3UTR_file = config_dict['Annotated_3UTR']
    Output_result_file = config_dict['Output_result_file']

    # Parameters (configured value, or the default announced on stdout)
    Num_least_in_group1 = _read_float_parameter(config_dict, 'Num_least_in_group1', 1, '1')
    Num_least_in_group2 = _read_float_parameter(config_dict, 'Num_least_in_group2', 1, '1')
    Coverage_cutoff = _read_float_parameter(config_dict, 'Coverage_cutoff', 30, '30')
    FDR_cutoff = _read_float_parameter(config_dict, 'FDR_cutoff', 0.05, '0.05')
    # 0.59 in log2 corresponds to a 1.5-fold change.  The configured value is
    # converted to float like every other parameter (it used to be kept as
    # the raw string read from the configure file).
    Fold_change_cutoff = _read_float_parameter(
        config_dict, 'Fold_change_cutoff', 0.59, '0.59[log2]/1.5-fold')
    PDUI_cutoff = _read_float_parameter(config_dict, 'PDUI_cutoff', 0.2, '0.2')
    Coverage_pPAS_cutoff = _read_float_parameter(
        config_dict, 'Coverage_pPAS_cutoff', 5.0, '5.0')

    # Collect sample files (group 1 first, then group 2)
    num_group_1 = len(Group1_Tophat_aligned_file)
    num_group_2 = len(Group2_Tophat_aligned_file)
    All_Sample_files = Group1_Tophat_aligned_file + Group2_Tophat_aligned_file

    # Prepare output directory
    d = os.path.dirname(output_directory)
    if not os.path.exists(d):
        os.makedirs(d)

    # Prepare temp directory
    temp_dir = d + '/tmp/'
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    Output_all_prediction_file = output_directory + Output_result_file + '_result_temp.txt'
    with open(Output_all_prediction_file, 'w') as Output_result:
        # Load coverage
        now_time("Loading coverage...")
        (All_samples_Target_3UTR_coverages,
         All_samples_sequencing_depths,
         UTR_events_dict) = Load_Target_Wig_files(All_Sample_files, Annotated_3UTR_file)

        # Depth (coverage) weight for each sample, relative to the mean depth
        All_sample_coverage_weights = (
            All_samples_sequencing_depths / np.mean(All_samples_sequencing_depths))
        now_time("Loading coverage finished.")

        # Prepare header information for output file
        first_line = ['Gene', 'Predicted_Proximal_APA', 'loci']
        for group_label, group_size in (('A', num_group_1), ('B', num_group_2)):
            for sample_no in range(1, group_size + 1):
                first_line.extend([
                    '%s_%s_long_exp' % (group_label, sample_no),
                    '%s_%s_short_exp' % (group_label, sample_no),
                    '%s_%s_PDUI' % (group_label, sample_no),
                    '%s_%s_EndClip_Score' % (group_label, sample_no),
                ])
        first_line.extend([
            'A_PDUI_mean',
            'B_PDUI_mean',
            'PDUI_Group_diff[B_PDUI_mean - A_PDUI_mean]',
            'Fold-change[A_PDUI_mean/B_PDUI_mean]',
            'Fold-change[B_short_exp_mean / A_short_exp_mean]',
            'Fold-change[B_score_mean / A_score_mean]',
        ])
        print("\t".join(first_line), end="\n", file=Output_result)

        # Test APA event for each 3UTR
        now_time("Testing APA events for each 3UTR region...")

        Wig_sample_files = []
        Bed_sample_files = []
        try:
            # Wig (bedGraph) file preparation
            for sample_path in All_Sample_files:
                Wig_sample_files.append(_open_track_file(
                    sample_path, ".bg",
                    "track type=bedGraph name=EndSnip_test_%s description=EndSnip_test_%s visibility=2 maxHeightPixels=40:40:20"))

            # Bed file preparation
            for sample_path in All_Sample_files:
                Bed_sample_files.append(_open_track_file(
                    sample_path, ".bed",
                    "track type=bed name=EndSnip_UTRIsoform_%s description=EndSnip_UTRIsoform_%s"))

            for curr_3UTR_id in UTR_events_dict:
                # 3UTR region information for each gene:
                # [chrom, start, end, strand, "chrom:start-end"]
                chrom, region_start, region_end, curr_strand, UTR_pos = \
                    UTR_events_dict[curr_3UTR_id][:5]
                print(curr_3UTR_id)

                # Only regions for which coverage was extracted are tested
                if curr_3UTR_id not in All_samples_Target_3UTR_coverages:
                    continue

                # Second '|'-separated field of the id; also published as a
                # module-level global.
                test_name = curr_3UTR_id.split('|')[1]

                # One [coverage list, region list] pair per sample
                curr_3UTR_all_samples_bp_coverage = []
                curr_3UTR_all_samples_bp_chrom_site = []
                for sample_coverage_wig in All_samples_Target_3UTR_coverages[curr_3UTR_id]:
                    bp_resolution_data = Convert_wig_into_bp_coverage(
                        sample_coverage_wig[0],  # list of coverage
                        sample_coverage_wig[1],  # list of 3UTR region sites
                        curr_strand)
                    curr_3UTR_all_samples_bp_coverage.append(bp_resolution_data[0])
                    curr_3UTR_all_samples_bp_chrom_site.append(bp_resolution_data[1])

                # De novo identification of APA event for each 3UTR region
                curr_3UTR_all_samples_bp_coverage = np.array(curr_3UTR_all_samples_bp_coverage)
                de_novo_coverage_comparison_with_windows(
                    curr_3UTR_all_samples_bp_coverage,
                    curr_3UTR_all_samples_bp_chrom_site,
                    region_start, region_end, curr_strand,
                    All_sample_coverage_weights, Coverage_pPAS_cutoff,
                    test_name, chrom,
                    Wig_sample_files, Bed_sample_files,
                    curr_3UTR_id, Output_result,
                    num_group_1, num_group_2, UTR_pos)
        finally:
            # Flush and release every per-sample track file, even on error.
            for track_handle in Wig_sample_files + Bed_sample_files:
                track_handle.close()

    # Elapsed time, reported as zero-padded HH:MM:SS
    elapsed_seconds = int(time.time() - start_time)
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    run_time = "Completely finished: %02d:%02d:%02d elapsed" % (hours, minutes, seconds)
    now_time(run_time)
def _extract_region_coverage(chrom_sites, chrom_depths, region_start, region_end):
    """Cut one region out of a chromosome's step-wise coverage.

    ``chrom_depths[i]`` is the depth on the positions after
    ``chrom_sites[i-1]`` up to and including ``chrom_sites[i]``;
    ``chrom_depths`` carries one extra trailing ``0`` for everything beyond
    the last site.

    Returns ``(coverage, region)``: two lists of equal length whose first
    site is *region_start* and last site is *region_end*.

    Example (sites ``[1,10,20,30,40,50,60]``, depths
    ``[1,10,10,10,30,30,50]``, region 5-55)::

        region   -> [ 5, 10, 20, 30, 40, 50, 55]
        coverage -> [10, 10, 10, 10, 30, 30, 50]
    """
    left = bisect(chrom_sites, region_start)   # first site > region_start
    right = bisect(chrom_sites, region_end)    # first site > region_end
    start_on_site = chrom_sites[left - 1] == region_start
    end_on_site = chrom_sites[right - 1] == region_end

    first = left - 1 if start_on_site else left
    region = chrom_sites[first:right]
    coverage = chrom_depths[first:right]
    if not start_on_site:
        # The region starts inside a step: reuse that step's depth.
        region.insert(0, region_start)
        coverage.insert(0, chrom_depths[left])
    if not end_on_site:
        # The region ends inside a step (or beyond the last site, where the
        # trailing sentinel supplies depth 0).
        region.append(region_end)
        coverage.extend(chrom_depths[right:right + 1])
    return coverage, region


def Load_Target_Wig_files(All_Wig_files, UTR_Annotation_file):
    """Load the 3'UTR annotation and each sample's coverage inside it.

    Args:
        All_Wig_files: Paths of tab-separated coverage files, one per
            sample, with chromosome, start, end in the first three columns
            and the depth in the last column.
        UTR_Annotation_file: Tab-separated annotation with chromosome,
            start, end and name in columns 1-4 and the strand in column 6.

    Returns:
        A tuple ``(coverage_dict, total_depths, UTR_events_dict)``:

        * ``coverage_dict[name]`` is a list with one
          ``[coverage_list, region_site_list]`` pair per sample whose file
          contains the region's chromosome (samples lacking the chromosome
          contribute no entry);
        * ``total_depths`` is a NumPy array holding, per sample, the sum of
          ``depth * (end - start)`` over all loaded lines;
        * ``UTR_events_dict[name]`` is
          ``[chrom, start, end, strand, "chrom:start-end"]`` for every
          annotated region whose ``end - start`` is at least 500, where
          ``start`` is the file's start column plus one.

    Raises:
        SystemExit: if a strand value is neither ``'+'`` nor ``'-'``.
    """
    UTR_events_dict = {}
    All_Samples_Total_depth = []

    # Load UTR annotation file
    with open(UTR_Annotation_file, 'r') as annotation_handle:
        for line in annotation_handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.rstrip().split("\t")
            curr_chr = fields[0]
            name = fields[3]
            curr_strand = fields[5]
            UTR_pos = "%s:%s-%s" % (curr_chr, fields[1], fields[2])
            region_start = int(fields[1])
            region_end = int(fields[2])

            # Define 3'UTR annotation regions.
            # TODO: confirm whether trimming 20% of the region is
            # appropriate; the trimming is currently disabled (0 bp).
            end_shift = 0
            if curr_strand == '+':
                region_end -= end_shift
            elif curr_strand == '-':
                region_start += end_shift
            else:
                sys.exit("ERROR: Strand column in your UTR annotation file is wrong...")

            region_start += 1  # 0-base => 1-base
            if (region_end - region_start) >= 500:  # Min 3UTR length (default: 500bp)
                # [chrom, start, end, strand, UTR_position(chrom:start-end)]
                UTR_events_dict[name] = [curr_chr, region_start, region_end,
                                         curr_strand, UTR_pos]
            # TODO: take into account the case where the decision is made
            # per isoform: when the stop codons differ, decide per isoform,
            # and also consider splicing inside the 3'UTR.

    # Load coverage for all samples
    All_samples_extracted_3UTR_coverage_dict = {}
    for curr_wig_file in All_Wig_files:
        # chromosome -> [[region sites], [depths]]
        curr_sample_All_chroms_coverage_dict = {}
        curr_sample_total_depth = 0

        with open(curr_wig_file, 'r') as wig_handle:
            for line in wig_handle:
                if '#' in line and line[0:3] != 'chr':
                    continue
                stripped = line.strip()
                if not stripped or stripped.startswith(('track', 'browser')):
                    continue  # blank line or track/browser header line
                fields = stripped.split("\t")

                chrom_name = fields[0]
                region_start = int(fields[1])
                region_end = int(fields[2])
                read_depth = int(float(fields[-1]))

                # Initialize coverage data in each chromosome
                if chrom_name not in curr_sample_All_chroms_coverage_dict:
                    curr_sample_All_chroms_coverage_dict[chrom_name] = [[0], [0]]
                sites, depths = curr_sample_All_chroms_coverage_dict[chrom_name]

                # A gap before this interval becomes an explicit depth-0 step
                if region_start > sites[-1]:
                    sites.append(region_start)
                    depths.append(0)
                sites.append(region_end)
                depths.append(read_depth)

                # Total coverage in this sample
                curr_sample_total_depth += read_depth * (region_end - region_start)

        # Terminate EVERY chromosome with a depth-0 sentinel so that slices
        # reaching one step past the last site stay aligned with the region
        # list.  (Previously only the last chromosome read received it, and
        # an empty file failed on an undefined/stale chromosome name.)
        for _sites, depths in curr_sample_All_chroms_coverage_dict.values():
            depths.append(0)

        All_Samples_Total_depth.append(curr_sample_total_depth)
        now_time(curr_wig_file + ": Total depth were loaded.")

        # Define each depth for each 3'UTR
        for curr_3UTR_event_id, curr_3UTR_structure in UTR_events_dict.items():
            curr_chr = curr_3UTR_structure[0]
            if curr_chr not in curr_sample_All_chroms_coverage_dict:
                continue
            chrom_sites, chrom_depths = curr_sample_All_chroms_coverage_dict[curr_chr]

            # NOTE(review): a region lying wholly inside one step now gets
            # that step's real depth; it used to be reported as [0, 0] even
            # when the step was covered.  Regions in gaps or beyond the last
            # site still yield [0, 0].  Confirm against
            # Convert_wig_into_bp_coverage.
            extracted_coverage, extracted_3UTR_region = _extract_region_coverage(
                chrom_sites, chrom_depths,
                curr_3UTR_structure[1], curr_3UTR_structure[2])

            # Gene information (curr_3UTR_event_id) =>
            # [[coverage list 1, region list 1], ..., [coverage list N, region list N]]
            All_samples_extracted_3UTR_coverage_dict.setdefault(
                curr_3UTR_event_id, []).append(
                    [extracted_coverage, extracted_3UTR_region])

        now_time(curr_wig_file + ": Each depth for each 3'UTR was loaded.")

    # Coverage per 3UTR region | total depth per sample | 3UTR region information
    return (All_samples_extracted_3UTR_coverage_dict,
            np.array(All_Samples_Total_depth),
            UTR_events_dict)