def run_jitterbug(psorted_cramfile_name, already_calc_discordant_reads, valid_discordant_reads_file_name, verbose,
                  te_annot, te_seqs, library_name, num_sdev, output_prefix, TE_name_tag, parallel, num_CPUs,
                  bin_size, min_mapq, generate_test_bam, print_extra_output, conf_lib_stats, mem,
                  min_cluster_size, reference_genome):

    mem_debug = False
    # print te_annot
    # print min_mapq

    # NOTE: comment this later !!!!!!!!!!!!!!!!
    # sorted_bam_reader = BamReader(output_prefix + ".proper_pair.sorted.bam", output_prefix)

    if mem_debug:
        reportResource("1")

    print "processing " + psorted_cramfile_name

    if not output_prefix:
        output_prefix = psorted_cramfile_name

    # Make a CramReader object with the cram file of mapped, position-sorted reads
    psorted_cram_reader = CramReader(psorted_cramfile_name, output_prefix, reference_genome)

    if generate_test_bam:
        print "generating test bam"
        psorted_cram_reader.output_one_chr_reads()
        return None

    start_time = datetime.datetime.now()
    print "starting at %s" % (str(start_time))

    if conf_lib_stats:
        # Get the mean and sdev of the insert size from the supplied config file
        stats = {}
        for line in open(conf_lib_stats):
            line = line.strip()
            (tag, val) = line.split("\t")
            stats[tag] = float(val)

        isize_mean = stats["fragment_length"]
        isize_sdev = stats["fragment_length_SD"]
        rlen_mean = stats["read_length"]
        rlen_sdev = stats["read_length_SD"]

        print "mean fragment length taken from config file: %.2f" % (isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

    else:
        # Get the mean and sdev of the insert size from the original cram file
        print "calculating mean insert size..."
        iterations = 1000000
        (isize_mean, isize_sdev, rlen_mean, rlen_sdev) = psorted_cram_reader.calculate_mean_sdev_isize(iterations)

        print "mean fragment length over %d reads: %.2f" % (iterations, isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

        # save the computed stats so they can be reused as a config file on later runs
        stats_file = open(output_prefix + ".read_stats.txt", "w")
        stats_file.write("fragment_length\t%.2f\n" % (isize_mean))
        stats_file.write("fragment_length_SD\t%.2f\n" % (isize_sdev))
        stats_file.write("read_length\t%.2f\n" % (rlen_mean))
        stats_file.write("read_length_SD\t%.2f" % (rlen_sdev))
        stats_file.close()
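    # For illustration only (not in the original source): the config file parsed
    # above is expected to hold one tab-separated tag/value pair per line, with
    # exactly these four tags (values here are made up):
    #
    #   fragment_length<TAB>300.00
    #   fragment_length_SD<TAB>30.00
    #   read_length<TAB>100.00
    #   read_length_SD<TAB>10.00
    #
    # The read_stats.txt written in the else-branch above uses the same format,
    # so it can be fed back in via conf_lib_stats on a later run.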
    # If the fragment length sdev is much larger than expected, there might be a problem
    # with the reads or the mapping; default to 0.1 * fragment length as a reasonable guess.
    # This is necessary because aberrant values for the sdev will mess up the interval
    # overlap calculation and the filtering.
    if isize_sdev > 0.2 * isize_mean:
        isize_sdev = 0.1 * isize_mean
        print "WARNING: fragment length standard deviation seems way too large to be realistic.\n\
There may be something weird with the flags in your bam mapping, or a very large number of large SVs\n\
that are messing up the count.\n\
Setting the stdev to 0.1*fragment_length = %.2f for downstream calculations" % isize_sdev

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    # sqlite database used to pass the selected read pairs to the clustering step; defined
    # outside the branch below so it is also set when discordant reads were pre-computed
    database_file = output_prefix + "dbfile.sqlite"

    ################# Find valid discordant reads in given sorted cram file ################
    # This will print bam file(s) with the set(s) of valid discordant reads, meaning
    # that the reads in a pair are mapped at distances greater than expected, or to
    # two different chromosomes, and at least one read of the pair maps uniquely.
    #
    # If strict_repetitive is True, two bam files are written:
    # <bam_file_name>.valid_discordant_pairs_strict_rep.bam: all valid discordant
    # read pairs with exactly one uniquely mapping and one repetitively mapping read
    # <bam_file_name>.valid_discordant_pairs.bam: all valid discordant read pairs
    # with two uniquely mapping reads
    # If strict_repetitive is False, both sets are output to a single file named
    # <bam_file_name>.valid_discordant_pairs.bam
    #
    # If the program has already been run and the discordant reads calculated, don't
    # recalculate them: look for a file called <bam_file_name>.valid_discordant_pairs
    # or <bam_file_name>.valid_discordant_pairs_strict_rep, depending on the value of -s.
    if not already_calc_discordant_reads:
        if mem_debug:
            reportResource("2")

        valid_discordant_reads_file_name = output_prefix + ".valid_discordant_pairs.cram"
        print "selecting discordant reads..."

        # this writes the cram file of discordant reads to disk to be used later,
        # and returns the counts of the different types of reads
        (bam_stats, ref_lengths, ref_names) = psorted_cram_reader.select_discordant_reads_psorted(
            verbose, isize_mean, valid_discordant_reads_file_name)
        # print ref_names, ref_lengths

        coverage = (bam_stats["total_reads"] * rlen_mean) / sum(ref_lengths)

        filter_conf_file = open(output_prefix + ".filter_config.txt", "w")
        filter_conf_file.write("cluster_size\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("span\t2\t%d\n" % isize_mean)
        filter_conf_file.write("int_size\t%d\t%d\n" % (rlen_mean, 2 * (isize_mean + 2 * isize_sdev - (rlen_mean - rlen_sdev))))
        filter_conf_file.write("softclipped\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("pick_consistent\t0\t-1")
        filter_conf_file.close()

        cram_stats_file = open(output_prefix + ".cram_stats.txt", "w")
        for key, value in bam_stats.items():
            cram_stats_file.write("%s\t%d\n" % (key, value))
        if mem_debug:
            reportResource("3")
        cram_stats_file.close()

        time = datetime.datetime.now()
        if mem_debug:
            reportResource("4")
        print "elapsed time: " + str(time - start_time)

        # print "sorting proper pair bam file in the background... "
        # args = ["samtools", "sort", output_prefix + ".proper_pair.bam", output_prefix + ".proper_pair.sorted"]
        # proper_pair_sort = subprocess.Popen(args)

    else:
        print "using already selected discordant reads in %s" % (valid_discordant_reads_file_name)
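    # Worked example (made-up numbers, not from the original source): with
    # isize_mean = 300, isize_sdev = 30, rlen_mean = 100, rlen_sdev = 10 and
    # 30x coverage, the tab-separated filter_config.txt written in the branch
    # above comes out as:
    #
    #   cluster_size    2    150    # 5 * 30x coverage
    #   span            2    300    # isize_mean
    #   int_size        100  540    # 2 * (300 + 2*30 - (100 - 10))
    #   softclipped     2    150
    #   pick_consistent 0    -1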
    ################### Select valid discordant reads that match a TE #####################
    # Of the valid discordant read pairs, select those that have exactly one side
    # overlapping a TE. This will return a list of AlignedReadPair objects.
    # The interval size is calculated as the INSIDE interval between the two reads
    # (?? is this true?), plus num_sdev * the sd of the insert size.
    print "selecting discordant read pairs where exactly one maps to a TE..."
    interval_size = isize_mean + num_sdev * isize_sdev

    if te_annot:
        discordant_bam_reader = CramReader(valid_discordant_reads_file_name, output_prefix, reference_genome)
        read_pair_one_overlap_TE_list = discordant_bam_reader.select_read_pair_one_overlap_TE_annot(
            te_annot, interval_size, min_mapq, database_file, bin_size)
        # the selected pairs live in the sqlite database; keep a non-empty placeholder
        # list so the empty-result check below still works, without holding them in RAM
        read_pair_one_overlap_TE_list = [1, 1]
        if not (print_extra_output or already_calc_discordant_reads):
            os.remove(valid_discordant_reads_file_name)
    else:
        # here you would map mate reads to TE sequences and whatnot.
        pass

    if mem_debug:
        reportResource("5")

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ######################## wait till the proper pair bam file is sorted, and index it ###########################

    ##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping non-TE genomic location, then pair a fwd and
    # a rev cluster if they overlap. For the clusters that can be paired, calculate
    # the softclipped support and the core reads, which will indicate the
    # heterozygosity of the predicted TE insertion.
    print "generating clusters..."

    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error: no discordant reads mapped to a TE location. Please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)

    cluster_list = ClusterList(read_pair_one_overlap_TE_list)

    #### COMMENTED OUT TO TAKE ADVANTAGE OF THE GENERATED DATABASE FILE ####
    # if not parallel:
    #     (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, "", False, min_cluster_size)
    #     all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ## parallel version:
    # else:
    #     # the last two args are the bed file handle and streaming: unnecessary ("") and False in this version
    #     all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, psorted_bamfile_name, "", False, min_cluster_size)

    # bed file to store the insertion regions, used below to query the cram file
    # when calculating the zygosity
    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")

    # low-RAM alternative: stream the results out all at once, reusing the sqlite
    # database file generated before
    all_clusters = cluster_list.generate_clusters_db(database_file, bin_size, output_prefix, "", verbose, bed_file_handle, True, min_cluster_size)
    bed_file_handle.close()  # make sure the bed file is flushed before samtools reads it below
    # re-load the clusters from the pickle written by generate_clusters_db
    all_clusters = list(load_pickle(output_prefix + 'all_clusters.pkl'))

    ### retrieve the reads in the intervals where insertions were predicted, and use
    ### them to calculate the allelic frequency (zygosity) of the predictions
    ins_regions_cram_name = output_prefix + ".insertion_regions.reads.cram"
    args = ["samtools", "view", "-h", "-C", "-T", reference_genome, "-L", bed_file_name,
            "-o", ins_regions_cram_name, psorted_cramfile_name]
    # open subprocess and wait till it finishes
    int_bed_reads_select = subprocess.Popen(args)
    outcode = int_bed_reads_select.wait()

    if outcode == 0:
        print "retrieving reads overlapping bed annots successful"
        # construct the list of args, open subprocess and wait till it finishes
        args = ["samtools", "index", ins_regions_cram_name]
        int_bed_reads_index = subprocess.Popen(args)
        outcode = int_bed_reads_index.wait()
        if outcode == 0:
            print "indexing successful"
        else:
            print "indexing failed"
    else:
        command = " ".join(args)
        print "retrieving reads overlapping bed annots failed! command: %s" % (command)
        sys.exit(1)
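    # The two subprocess calls above are equivalent to running, on the command
    # line (file names here are illustrative):
    #
    #   samtools view -h -C -T reference.fa -L prefix.insertion_regions.bed \
    #       -o prefix.insertion_regions.reads.cram input.cram
    #   samtools index prefix.insertion_regions.reads.cram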
    insertion_regions_reads_cram = pysam.AlignmentFile(ins_regions_cram_name, mode="rc",
                                                       reference_filename=reference_genome)

    for (cluster_pairs, fwd, rev, string) in all_clusters:
        for cluster_pair in cluster_pairs:
            try:
                reads = insertion_regions_reads_cram.fetch(cluster_pair.get_chr(),
                                                           cluster_pair.get_insertion_int_start(),
                                                           cluster_pair.get_insertion_int_end())
                cluster_pair.calc_zygosity(reads)
            except:
                print "error calculating zygosity of:"
                print cluster_pair
                raise
    print "Done calculating zygosity of each cluster pair"

    del read_pair_one_overlap_TE_list
    gc.collect()
    if mem_debug:
        reportResource("5")

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ###################### print reads that were clustered to bam, output to gff and table ########################################
    print "writing clustered reads to bam file, writing to gff and tables... "

    pair_gff_output_file = open(output_prefix + ".TE_insertions_paired_clusters.gff3", "w")
    pair_table_output_file = open(output_prefix + ".TE_insertions_paired_clusters.supporting_clusters.table", "w")
    pair_table_output_file.write(table_header(library_name, library_name, te_annot))

    # if print_extra_output:
    #     single_gff_output_file = open(output_prefix + ".TE_insertions_single_cluster.gff3", "w")
    #     single_table_output_file = open(output_prefix + ".TE_insertions_single_cluster.supporting_clusters.table", "w")
    #     single_table_output_file.write(table_header(library_name, library_name, te_annot))

    print len(all_clusters)
    cluster_ID = 0
    for (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, strings) in all_clusters:
        # unpaired clusters are no longer reported
        # if print_extra_output:
        #     for fwd_cluster in unpaired_fwd_clusters:
        #         single_gff_output_file.write(fwd_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(fwd_cluster.to_table(cluster_ID, library_name))
        #         cluster_ID += 1
        #     for rev_cluster in unpaired_rev_clusters:
        #         single_gff_output_file.write(rev_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(rev_cluster.to_table(cluster_ID, library_name))
        #         cluster_ID += 1
        # print cluster_ID
        for cluster_pair in cluster_pairs:
            pair_gff_output_file.write(cluster_pair.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
            pair_table_output_file.write(cluster_pair.to_table(cluster_ID, library_name))
            cluster_ID += 1

    # clustered_reads_bam_file.close()
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    pair_gff_output_file.close()
    pair_table_output_file.close()
    # if print_extra_output:
    #     single_gff_output_file.close()
    #     single_table_output_file.close()

    end_time = str(datetime.datetime.now())
    print "done! at " + end_time

    run_stats = open(output_prefix + ".run_stats.txt", "w")
    run_stats.write("lib\t%s\n" % (library_name))
    if not already_calc_discordant_reads:
        run_stats.write("coverage\t%s\n" % (coverage))
    run_stats.write("runtime\t%s\n" % (datetime.datetime.now() - start_time))
    run_stats.write("numCPUs\t%s\n" % (num_CPUs))
    run_stats.close()
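# --- Illustrative sketch, not part of the original tool -----------------------
# A minimal example of how insert-size and read-length statistics like the ones
# used above could be estimated with pysam. The real implementation lives in
# BamReader/CramReader.calculate_mean_sdev_isize (not shown in this section);
# the function name and sampling scheme here are assumptions.
def _example_mean_sdev_isize(bam_name, iterations=1000000):
    import math
    import pysam

    isizes = []
    rlens = []
    bam = pysam.AlignmentFile(bam_name, "rb")
    for read in bam:
        if len(isizes) >= iterations:
            break
        # sample only the leftmost read of each properly mapped pair
        if read.is_proper_pair and not read.is_reverse and read.template_length > 0:
            isizes.append(read.template_length)
            rlens.append(read.query_length)
    bam.close()
    if not isizes:
        raise ValueError("no proper pairs found in %s" % bam_name)

    def mean_sdev(values):
        m = sum(values) / float(len(values))
        sd = math.sqrt(sum((v - m) ** 2 for v in values) / float(len(values)))
        return (m, sd)

    (isize_mean, isize_sdev) = mean_sdev(isizes)
    (rlen_mean, rlen_sdev) = mean_sdev(rlens)
    return (isize_mean, isize_sdev, rlen_mean, rlen_sdev)
# -------------------------------------------------------------------------------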
# bam-based variant of run_jitterbug (cf. the cram-based version above)
def run_jitterbug(psorted_bamfile_name, already_calc_discordant_reads, valid_discordant_reads_file_name, verbose,
                  te_annot, te_seqs, library_name, num_sdev, output_prefix, TE_name_tag, parallel, num_CPUs,
                  bin_size, min_mapq, generate_test_bam, print_extra_output, conf_lib_stats, mem, min_cluster_size):

    mem_debug = False
    # print te_annot
    # print min_mapq

    # NOTE: comment this later !!!!!!!!!!!!!!!!
    # sorted_bam_reader = BamReader(output_prefix + ".proper_pair.sorted.bam", output_prefix)

    if mem_debug:
        reportResource("1")

    print "processing " + psorted_bamfile_name

    if not output_prefix:
        output_prefix = psorted_bamfile_name

    # Make a BamReader object with the bam file of mapped, position-sorted reads
    # NOTE: uncomment this later !!!!!!!!!!!!!
    psorted_bam_reader = BamReader(psorted_bamfile_name, output_prefix)

    if generate_test_bam:
        print "generating test bam"
        psorted_bam_reader.output_one_chr_reads()
        return None

    start_time = datetime.datetime.now()
    print "starting at %s" % (str(start_time))

    if conf_lib_stats:
        # Get the mean and sdev of the insert size from the supplied config file
        stats = {}
        for line in open(conf_lib_stats):
            line = line.strip()
            (tag, val) = line.split("\t")
            stats[tag] = float(val)

        isize_mean = stats["fragment_length"]
        isize_sdev = stats["fragment_length_SD"]
        rlen_mean = stats["read_length"]
        rlen_sdev = stats["read_length_SD"]

        print "mean fragment length taken from config file: %.2f" % (isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

    else:
        # Get the mean and sdev of the insert size from the original bam file
        print "calculating mean insert size..."
        iterations = 1000000
        (isize_mean, isize_sdev, rlen_mean, rlen_sdev) = psorted_bam_reader.calculate_mean_sdev_isize(iterations)

        print "mean fragment length over %d reads: %.2f" % (iterations, isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

        # save the computed stats so they can be reused as a config file on later runs
        stats_file = open(output_prefix + ".read_stats.txt", "w")
        stats_file.write("fragment_length\t%.2f\n" % (isize_mean))
        stats_file.write("fragment_length_SD\t%.2f\n" % (isize_sdev))
        stats_file.write("read_length\t%.2f\n" % (rlen_mean))
        stats_file.write("read_length_SD\t%.2f" % (rlen_sdev))
        stats_file.close()
    # If the fragment length sdev is much larger than expected, there might be a problem
    # with the reads or the mapping; default to 0.1 * fragment length as a reasonable guess.
    # This is necessary because aberrant values for the sdev will mess up the interval
    # overlap calculation and the filtering.
    if isize_sdev > 0.2 * isize_mean:
        isize_sdev = 0.1 * isize_mean
        print "WARNING: fragment length standard deviation seems way too large to be realistic.\n\
There may be something weird with the flags in your bam mapping, or a very large number of large SVs\n\
that are messing up the count.\n\
Setting the stdev to 0.1*fragment_length = %.2f for downstream calculations" % isize_sdev

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    # sqlite database used to pass the selected read pairs to the clustering step; defined
    # outside the branch below so it is also set when discordant reads were pre-computed
    database_file = output_prefix + "dbfile.sqlite"

    ################# Find valid discordant reads in given sorted bam file ################
    # This will print bam file(s) with the set(s) of valid discordant reads, meaning
    # that the reads in a pair are mapped at distances greater than expected, or to
    # two different chromosomes, and at least one read of the pair maps uniquely.
    #
    # If strict_repetitive is True, two bam files are written:
    # <bam_file_name>.valid_discordant_pairs_strict_rep.bam: all valid discordant
    # read pairs with exactly one uniquely mapping and one repetitively mapping read
    # <bam_file_name>.valid_discordant_pairs.bam: all valid discordant read pairs
    # with two uniquely mapping reads
    # If strict_repetitive is False, both sets are output to a single file named
    # <bam_file_name>.valid_discordant_pairs.bam
    #
    # If the program has already been run and the discordant reads calculated, don't
    # recalculate them: look for a file called <bam_file_name>.valid_discordant_pairs
    # or <bam_file_name>.valid_discordant_pairs_strict_rep, depending on the value of -s.
    if not already_calc_discordant_reads:
        if mem_debug:
            reportResource("2")

        valid_discordant_reads_file_name = output_prefix + ".valid_discordant_pairs.bam"
        print "selecting discordant reads..."

        # this writes the bam file of discordant reads to disk to be used later,
        # and returns the counts of the different types of reads
        (bam_stats, ref_lengths, ref_names) = psorted_bam_reader.select_discordant_reads_psorted(
            verbose, isize_mean, valid_discordant_reads_file_name)
        # print ref_names, ref_lengths

        coverage = (bam_stats["total_reads"] * rlen_mean) / sum(ref_lengths)

        filter_conf_file = open(output_prefix + ".filter_config.txt", "w")
        filter_conf_file.write("cluster_size\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("span\t2\t%d\n" % isize_mean)
        filter_conf_file.write("int_size\t%d\t%d\n" % (rlen_mean, 2 * (isize_mean + 2 * isize_sdev - (rlen_mean - rlen_sdev))))
        filter_conf_file.write("softclipped\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("pick_consistent\t0\t-1")
        filter_conf_file.close()

        bam_stats_file = open(output_prefix + ".bam_stats.txt", "w")
        for key, value in bam_stats.items():
            bam_stats_file.write("%s\t%d\n" % (key, value))
        if mem_debug:
            reportResource("3")
        bam_stats_file.close()

        time = datetime.datetime.now()
        if mem_debug:
            reportResource("4")
        print "elapsed time: " + str(time - start_time)

        # print "sorting proper pair bam file in the background... "
        # args = ["samtools", "sort", output_prefix + ".proper_pair.bam", output_prefix + ".proper_pair.sorted"]
        # proper_pair_sort = subprocess.Popen(args)

    else:
        print "using already selected discordant reads in %s" % (valid_discordant_reads_file_name)
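    # For scale (made-up numbers, consistent with the example config above):
    # 600 million mapped reads of mean length 100 bp against chromosomes
    # totalling 2 Gbp give
    #   coverage = (600e6 * 100) / 2e9 = 30x
    # so the cluster_size and softclipped filters written in the branch above
    # default to an upper bound of 5 * 30 = 150 supporting reads.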
    ################### Select valid discordant reads that match a TE #####################
    # Of the valid discordant read pairs, select those that have exactly one side
    # overlapping a TE. This will return a list of AlignedReadPair objects.
    # The interval size is calculated as the INSIDE interval between the two reads
    # (?? is this true?), plus num_sdev * the sd of the insert size.
    print "selecting discordant read pairs where exactly one maps to a TE..."
    interval_size = isize_mean + num_sdev * isize_sdev

    if te_annot:
        discordant_bam_reader = BamReader(valid_discordant_reads_file_name, output_prefix)
        read_pair_one_overlap_TE_list = discordant_bam_reader.select_read_pair_one_overlap_TE_annot(
            te_annot, interval_size, min_mapq, database_file, bin_size)
        # the selected pairs live in the sqlite database; keep a non-empty placeholder
        # list so the empty-result check below still works, without holding them in RAM
        read_pair_one_overlap_TE_list = [1, 1]
        if not (print_extra_output or already_calc_discordant_reads):
            os.remove(valid_discordant_reads_file_name)
    else:
        # here you would map mate reads to TE sequences and whatnot.
        pass

    if mem_debug:
        reportResource("5")

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ######################## wait till the proper pair bam file is sorted, and index it ###########################

    ##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping non-TE genomic location, then pair a fwd and
    # a rev cluster if they overlap. For the clusters that can be paired, calculate
    # the softclipped support and the core reads, which will indicate the
    # heterozygosity of the predicted TE insertion.
    print "generating clusters..."

    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error: no discordant reads mapped to a TE location. Please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)

    cluster_list = ClusterList(read_pair_one_overlap_TE_list)

    #### COMMENTED OUT TO TAKE ADVANTAGE OF THE GENERATED DATABASE FILE ####
    # if not parallel:
    #     (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, "", False, min_cluster_size)
    #     all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ## parallel version:
    # else:
    #     # the last two args are the bed file handle and streaming: unnecessary ("") and False in this version
    #     all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, psorted_bamfile_name, "", False, min_cluster_size)

    # bed file to store the insertion regions
    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")

    # low-RAM alternative: stream the results out all at once, reusing the sqlite
    # database file generated before
    all_clusters = cluster_list.generate_clusters_db(database_file, bin_size, output_prefix, "", verbose, bed_file_handle, True, min_cluster_size)
    # re-load the clusters from the pickle written by generate_clusters_db
    all_clusters = list(load_pickle(output_prefix + 'all_clusters.pkl'))

    del read_pair_one_overlap_TE_list
    gc.collect()
    if mem_debug:
        reportResource("5")

    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)
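    # load_pickle is defined elsewhere in the module; it is assumed here to be a
    # generator over the pickled cluster tuples, roughly (sketch under that
    # assumption, not the original code):
    #
    #   def load_pickle(file_name):
    #       with open(file_name, "rb") as handle:
    #           while True:
    #               try:
    #                   yield pickle.load(handle)
    #               except EOFError:
    #                   break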
" pair_gff_output_file = open(output_prefix + ".TE_insertions_paired_clusters.gff3", "w") pair_table_output_file = open(output_prefix + ".TE_insertions_paired_clusters.supporting_clusters.table", "w") pair_table_output_file.write(table_header(library_name, library_name, te_annot)) # if print_extra_output: # single_gff_output_file = open(output_prefix + ".TE_insertions_single_cluster.gff3", "w") # single_table_output_file = open(output_prefix + ".TE_insertions_single_cluster.supporting_clusters.table", "w") # single_table_output_file.write(table_header(library_name, library_name, te_annot)) print len(all_clusters) cluster_ID = 0 for (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,strings) in all_clusters: # unpaired clusters are no longer reported # if print_extra_output: # for fwd_cluster in unpaired_fwd_clusters: # single_gff_output_file.write(fwd_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n") # single_table_output_file.write(fwd_cluster.to_table(cluster_ID, library_name)) # cluster_ID += 1 # for rev_cluster in unpaired_rev_clusters: # single_gff_output_file.write(rev_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n") # single_table_output_file.write(rev_cluster.to_table(cluster_ID, library_name)) # cluster_ID += 1 #print cluster_ID for cluster_pair in cluster_pairs: pair_gff_output_file.write(cluster_pair.to_gff(cluster_ID, library_name, TE_name_tag) + "\n") pair_table_output_file.write(cluster_pair.to_table(cluster_ID, library_name)) cluster_ID += 1 #clustered_reads_bam_file.close() time = datetime.datetime.now() print "elapsed time: " + str(time - start_time) pair_gff_output_file.close() pair_table_output_file.close() # if print_extra_output: # single_gff_output_file.close() # single_table_output_file.close() end_time = str(datetime.datetime.now()) print "done! at " + end_time run_stats = open(output_prefix + ".run_stats.txt", "w") run_stats.write("lib\t%s\n" % (library_name)) if not already_calc_discordant_reads: run_stats.write("coverage\t%s\n" % (coverage)) run_stats.write("runtime\t%s\n" % ( datetime.datetime.now() - start_time)) run_stats.write("numCPUs\t%s\n" % (num_CPUs)) run_stats.close()