##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping, non-TE genomic location,
    # then pair a fwd and a rev cluster if they overlap.
    # For the clusters that can be paired, calculate the softclipped support and the core reads,
    # which indicate the heterozygosity of the predicted TE insertion

    # bed file to store the insertion regions; used later to query the bam file and calculate the zygosity
    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")

    print "generating clusters..."
    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error, no discordant reads mapped to a TE location. please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)
    cluster_list = ClusterList(read_pair_one_overlap_TE_list)

    # if not parallel:
    #    (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, bed_file_handle, True, min_cluster_size)
    #    all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ##parallel version:
    # else:
    #    # empty string is the psorted bam file name, and True is streaming
    #    all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, "", bed_file_handle, True, min_cluster_size,output_prefix)
    #    # bed_file_handle.close()

    ##### Low-RAM alternative: prints out all the results at once and uses the sqlite database file generated above.
    all_clusters = cluster_list.generate_clusters_db(database_file, bin_size,
                                                     output_prefix, "",
                                                     verbose, bed_file_handle,
                                                     True, min_cluster_size)
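
A minimal, hypothetical sketch (not part of jitterbug) of what the downstream zygosity step does with the insertion_regions.bed written above: fetch the reads that overlap each predicted insertion interval from an indexed, coordinate-sorted BAM and compare softclipped reads (supporting the insertion allele) with reads that span the interval cleanly (supporting the reference allele). The function name, file handling and the simple ratio are illustrative assumptions; in jitterbug itself this is done by cluster_pair.calc_zygosity() on the reads fetched from the insertion regions.

import pysam

def rough_zygosity(bam_path, chrom, start, end):
    # illustrative only: softclipped reads hint at the insertion allele,
    # cleanly spanning reads hint at the reference allele
    clipped = spanning = 0
    bam = pysam.AlignmentFile(bam_path, "rb")  # indexed, coordinate-sorted BAM assumed
    for read in bam.fetch(chrom, start, end):
        if read.cigartuples is None:
            continue
        if any(op == 4 for op, length in read.cigartuples):  # CIGAR op 4 = soft clip
            clipped += 1
        elif read.reference_start <= start and read.reference_end >= end:
            spanning += 1
    bam.close()
    total = clipped + spanning
    return float(clipped) / total if total else 0.0
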
Example #2
def run_jitterbug(psorted_cramfile_name, already_calc_discordant_reads, valid_discordant_reads_file_name, verbose,
                  te_annot, te_seqs, library_name, num_sdev, output_prefix, TE_name_tag, parallel, num_CPUs, bin_size,
                  min_mapq, generate_test_bam, print_extra_output, conf_lib_stats, mem, min_cluster_size,
                  reference_genome):
    mem_debug = False

    # print te_annot
    # print min_mapq

    # NOTE: comment this later !!!!!!!!!!!!!!!!
    # sorted_bam_reader = BamReader(output_prefix + ".proper_pair.sorted.bam", output_prefix)

    if mem_debug:
        reportResource("1")

    print "processing " + psorted_cramfile_name
    if not output_prefix:
        output_prefix = psorted_cramfile_name

    # Make BamReader object with bam file of mapped and sorted reads
    psorted_cram_reader = CramReader(psorted_cramfile_name, output_prefix, reference_genome)

    if generate_test_bam:
        print "generating test bam"
        psorted_cram_reader.output_one_chr_reads()
        return None

    start_time = datetime.datetime.now()
    print "starting at %s" % (str(start_time))

    if conf_lib_stats:
        # Get the mean and sdev of insert size from supplied config file
        stats = {}
        for line in open(conf_lib_stats):
            line = line.strip()
            (tag, val) = line.split("\t")
            stats[tag] = (float(val))
        isize_mean = stats["fragment_length"]
        isize_sdev = stats["fragment_length_SD"]
        rlen_mean = stats["read_length"]
        rlen_sdev = stats["read_length_SD"]
        print "mean fragment length taken from config file: %.2f" % (isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

    else:
        # Get the mean and sdev of insert size from the original bam file
        print "calculating mean insert size..."
        iterations = 1000000
        (isize_mean, isize_sdev, rlen_mean, rlen_sdev) = psorted_cram_reader.calculate_mean_sdev_isize(iterations)
        print "mean fragment length over %d reads: %.2f" % (iterations, isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

        stats_file = open(output_prefix + ".read_stats.txt", "w")
        stats_file.write("fragment_length\t%.2f\n" % (isize_mean))
        stats_file.write("fragment_length_SD\t%.2f\n" % (isize_sdev))
        stats_file.write("read_length\t%.2f\n" % (rlen_mean))
        stats_file.write("read_length_SD\t%.2f" % (rlen_sdev))

        stats_file.close()

        # if the fragment sdev is much larger than expected, there might be a problem with the reads or the mapping.
        # Default to 0.1 * fragment length as a reasonable guess: aberrant values for the sdev would otherwise
        # mess up the interval overlap calculation and the filtering.
        if isize_sdev > 0.2 * isize_mean:
            isize_sdev = 0.1 * isize_mean
            print "WARNING: fragment length standard deviation seems way too large to be realistic.\n\
            There may be something weird with the flags in your bam mapping, or a very large number of large SVs\n\
            that are messing up the count.\n\
            Setting the stdev to 0.1*fragment_length = %.2f for downstream calculations" % isize_sdev
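            # e.g. with isize_mean = 300 this clamp fires once the observed sdev exceeds 60 (0.2 * 300),
            # and the value used downstream becomes 30 (0.1 * 300)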

        time = datetime.datetime.now()
        print "elapsed time: " + str(time - start_time)

    ################# Find valid discordant reads in given sorted bam file ################
    # This will print bam file(s) with the set(s) of valid discordant reads, meaning
    # that the reads in a pair are mapped at distances greater than expected or to
    # two different chromosomes, and with at least one read that is mapped uniquely

    # if strict_repetitive is TRUE, will print out two bam files:
    # <bam_file_name>.valid_discordant_pairs_strict_rep.bam, which is all valid discordant
    # read pairs with exactly one uniquely mapping and one repetitively mapping read
    #
    # <bam_file_name>.valid_discordant_pairs.bam, which is all valid discordant
    # read pairs with two uniquely mapping reads

    # if strict_repetitive is FALSE, will output both sets to a file named
    # <bam_file_name>.valid_discordant_pairs.bam

    # if the discordant reads have already been calculated in a previous run, don't do it again; instead look for a file called
    # <bam_file_name>.valid_discordant_pairs or <bam_file_name>.valid_discordant_pairs_strict_rep depending on the value of -s
    if not already_calc_discordant_reads:

        if mem_debug:
            reportResource("2")

        valid_discordant_reads_file_name = output_prefix + ".valid_discordant_pairs.cram"
        database_file = output_prefix + "dbfile.sqlite"

        print "selecting discordant reads..."
        # this writes the bam file of discordant reads to disk to be used later, and returns the counts of the different types of reads
        (bam_stats, ref_lengths, ref_names) = psorted_cram_reader.select_discordant_reads_psorted(verbose, isize_mean,
                                                                                                  valid_discordant_reads_file_name)
        # print ref_names, ref_lengths
        coverage = (bam_stats["total_reads"] * rlen_mean) / sum(ref_lengths)

        filter_conf_file = open(output_prefix + ".filter_config.txt", "w")
        filter_conf_file.write("cluster_size\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("span\t2\t%d\n" % isize_mean)
        filter_conf_file.write(
            "int_size\t%d\t%d\n" % (rlen_mean, 2 * (isize_mean + 2 * isize_sdev - (rlen_mean - rlen_sdev))))
        filter_conf_file.write("softclipped\t2\t%d\n" % (5 * coverage))
        filter_conf_file.write("pick_consistent\t0\t-1")

        filter_conf_file.close()
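
        # With assumed library stats (coverage ~30x, isize_mean = 300, isize_sdev = 30,
        # rlen_mean = 100, rlen_sdev = 5), the filter_config.txt written above would contain:
        #   cluster_size     2    150
        #   span             2    300
        #   int_size         100  530
        #   softclipped      2    150
        #   pick_consistent  0    -1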

        cram_stats_file = open(output_prefix + ".cram_stats.txt", "w")
        for key, value in bam_stats.items():
            cram_stats_file.write("%s\t%d\n" % (key, value))

        if mem_debug:
            reportResource("3")
        cram_stats_file.close()

        time = datetime.datetime.now()
        if mem_debug:
            reportResource("4")
        print "elapsed time: " + str(time - start_time)


    #        print "sorting proper pair bam file in the background... "
    #        args = ["samtools", "sort", output_prefix + ".proper_pair.bam", output_prefix + ".proper_pair.sorted"]
    #        proper_pair_sort = subprocess.Popen(args)
    else:
        print "using already selected discordant reads in %s" % (valid_discordant_reads_file_name)

    ################### Select valid discordant reads that match a TE #####################
    # Of the valid discordant read pairs, select those that have exactly one side that
    # overlaps a TE. This will return a list of AlignedReadPair objects.
    # The interval size is calculated as the INSIDE interval between the two reads (?? is this true?),
    # plus num_sdev * the sd of the insert size

    print "selecting discordant read pairs where exactly one maps to a TE..."
    interval_size = isize_mean + num_sdev * isize_sdev

    if te_annot:
        discordant_bam_reader = CramReader(valid_discordant_reads_file_name, output_prefix, reference_genome)
        read_pair_one_overlap_TE_list = discordant_bam_reader.select_read_pair_one_overlap_TE_annot(te_annot,
                                                                                                    interval_size,
                                                                                                    min_mapq,
                                                                                                    database_file,
                                                                                                    bin_size)
        read_pair_one_overlap_TE_list = [1, 1]
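        # note: the read pairs themselves were stored in the sqlite database above;
        # the list is (presumably) swapped for a tiny placeholder here so the
        # downstream emptiness check still passes while the memory is freed early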
        if not (print_extra_output or already_calc_discordant_reads):
            os.remove(valid_discordant_reads_file_name)

    else:

        # here you would map mate reads to TE sequences and whatnot.
        pass

    if mem_debug:
        reportResource("5")
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ######################## wait till the proper pair bam file is sorted, and index it ###########################

    ##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping, non-TE genomic location,
    # then pair a fwd and a rev cluster if they overlap.
    # For the clusters that can be paired, calculate the softclipped support and the core reads,
    # which indicate the heterozygosity of the predicted TE insertion

    print "generating clusters..."
    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error, no discordant reads mapped to a TE location. please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)
    cluster_list = ClusterList(read_pair_one_overlap_TE_list)

    ####    COMMENTED TO TAKE ADVANTAGE OF THE GENERATED DATABASE FILE###
    # if not parallel:
    #    (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, "", False, min_cluster_size)
    #    all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ##parallel version:
    # else:
    #    # last two args are bed file handle and streaming, unnecessary and False, in this version
    #    all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, psorted_bamfile_name, "", False, min_cluster_size)

    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")
    all_clusters = cluster_list.generate_clusters_db(database_file, bin_size, output_prefix, "", verbose,
                                                     bed_file_handle, True, min_cluster_size)
    all_clusters = list(load_pickle(output_prefix + 'all_clusters.pkl'))

    ins_regions_cram_name = output_prefix + ".insertion_regions.reads.cram"
    args = ["samtools", "view", "-h", "-C", "-T", reference_genome, "-L", bed_file_name, "-o", ins_regions_cram_name,
            psorted_cramfile_name]
    # open subprocess 
    int_bed_reads_select = subprocess.Popen(args)
    # wait till it finishes
    outcode = int_bed_reads_select.wait()
    if outcode == 0:
        print "retrieving reads overlapping bed annots successful"
        # construct list of args 
        args = ["samtools", "index", ins_regions_cram_name]
        # open subprocess 
        int_bed_reads_index = subprocess.Popen(args)
        # wait till it finishes
        outcode = int_bed_reads_index.wait()
        if outcode == 0:
            print "indexing successful"
        else:
            print "indexing failed"
    else:
        command = "\t".join(args)
        print "retrieving reads overlapping bed annots failed! command: %s " % (command)
        sys.exit(1)
    insertion_regions_reads_cram = pysam.AlignmentFile(ins_regions_cram_name, mode="rc",
                                                       reference_filename=reference_genome)
    for (cluster_pairs, fwd, rev, string) in all_clusters:
        for cluster_pair in cluster_pairs:
            try:
                reads = insertion_regions_reads_cram.fetch(cluster_pair.get_chr(),
                                                           cluster_pair.get_insertion_int_start(),
                                                           cluster_pair.get_insertion_int_end())
                cluster_pair.calc_zygosity(reads)
            except:
                print "error calculating zygosity of: "
                print cluster_pair
                raise
    print "Done calculating zygosity of each cluster pair"

    del read_pair_one_overlap_TE_list
    gc.collect()
    if mem_debug:
        reportResource("5")
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ###################### print reads that were clustered to bam, output to gff and table ########################################

    print "writing clustered reads to bam file, writing to gff and tables... "

    pair_gff_output_file = open(output_prefix + ".TE_insertions_paired_clusters.gff3", "w")
    pair_table_output_file = open(output_prefix + ".TE_insertions_paired_clusters.supporting_clusters.table", "w")
    pair_table_output_file.write(table_header(library_name, library_name, te_annot))

    # if print_extra_output: 
    #     single_gff_output_file = open(output_prefix + ".TE_insertions_single_cluster.gff3", "w")
    #     single_table_output_file = open(output_prefix + ".TE_insertions_single_cluster.supporting_clusters.table", "w")
    #     single_table_output_file.write(table_header(library_name, library_name, te_annot))

    print len(all_clusters)
    cluster_ID = 0
    for (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, strings) in all_clusters:

        # unpaired clusters are no longer reported
        # if print_extra_output:
        #     for fwd_cluster in unpaired_fwd_clusters:
        #         single_gff_output_file.write(fwd_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(fwd_cluster.to_table(cluster_ID, library_name))

        #         cluster_ID += 1

        #     for rev_cluster in unpaired_rev_clusters:
        #         single_gff_output_file.write(rev_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(rev_cluster.to_table(cluster_ID, library_name))
        #         cluster_ID += 1

        # print cluster_ID
        for cluster_pair in cluster_pairs:
            pair_gff_output_file.write(cluster_pair.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
            pair_table_output_file.write(cluster_pair.to_table(cluster_ID, library_name))
            cluster_ID += 1

    # clustered_reads_bam_file.close()
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    pair_gff_output_file.close()
    pair_table_output_file.close()

    # if print_extra_output:
    #     single_gff_output_file.close()
    #     single_table_output_file.close()

    end_time = str(datetime.datetime.now())
    print "done! at " + end_time

    run_stats = open(output_prefix + ".run_stats.txt", "w")
    run_stats.write("lib\t%s\n" % (library_name))
    if not already_calc_discordant_reads:
        run_stats.write("coverage\t%s\n" % (coverage))
    run_stats.write("runtime\t%s\n" % (datetime.datetime.now() - start_time))
    run_stats.write("numCPUs\t%s\n" % (num_CPUs))
    run_stats.close()
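
The lib-stats config file (conf_lib_stats) that run_jitterbug accepts, and the .read_stats.txt file it writes, are plain tab-separated tag/value tables with the four tags fragment_length, fragment_length_SD, read_length and read_length_SD. A small self-contained sketch of writing such a file, re-parsing it the same way as above, and deriving the TE-overlap interval size (isize_mean + num_sdev * isize_sdev); the file name, the numbers and the num_sdev value are made-up examples:

# minimal sketch with assumed values, not taken from any real library
stats = {
    "fragment_length": 300.0,
    "fragment_length_SD": 30.0,
    "read_length": 100.0,
    "read_length_SD": 5.0,
}

with open("example.read_stats.txt", "w") as stats_file:
    for tag, val in stats.items():
        stats_file.write("%s\t%.2f\n" % (tag, val))

# re-parse it exactly the way run_jitterbug does
parsed = {}
for line in open("example.read_stats.txt"):
    (tag, val) = line.strip().split("\t")
    parsed[tag] = float(val)

num_sdev = 2  # illustrative; normally a command-line parameter
interval_size = parsed["fragment_length"] + num_sdev * parsed["fragment_length_SD"]
print(interval_size)  # 360.0 with the numbers above
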
def run_jitterbug(psorted_bamfile_name, already_calc_discordant_reads, valid_discordant_reads_file_name, verbose, te_annot, \
    te_seqs, library_name, num_sdev, output_prefix, TE_name_tag, parallel, num_CPUs, bin_size, min_mapq, generate_test_bam, print_extra_output, conf_lib_stats, mem, min_cluster_size):


    mem_debug = False

    # print te_annot
    # print min_mapq

    

    #NOTE: comment this later !!!!!!!!!!!!!!!!
    #sorted_bam_reader = BamReader(output_prefix + ".proper_pair.sorted.bam", output_prefix)

    if mem_debug:
        reportResource("1")

    print "processing " + psorted_bamfile_name
    if not output_prefix:
        output_prefix = psorted_bamfile_name



    # Make BamReader object with bam file of mapped and sorted reads
    # NOTE: uncomment this later !!!!!!!!!!!!!
    psorted_bam_reader = BamReader(psorted_bamfile_name, output_prefix)

    if generate_test_bam:
        print "generating test bam"
        psorted_bam_reader.output_one_chr_reads()
        return None

    start_time = datetime.datetime.now()
    print "starting at %s" % (str(start_time))

    if conf_lib_stats:
        # Get the mean and sdev of insert size from supplied config file
        stats = {}
        for line in open(conf_lib_stats):
            line = line.strip()
            (tag, val) = line.split("\t")
            stats[tag] = (float(val))
        isize_mean = stats["fragment_length"]
        isize_sdev = stats["fragment_length_SD"]
        rlen_mean = stats["read_length"]
        rlen_sdev = stats["read_length_SD"]
        print "mean fragment length taken from config file: %.2f" % (isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

    else:
        # Get the mean and sdev of insert size from the original bam file
        print "calculating mean insert size..."
        iterations = 1000000
        (isize_mean, isize_sdev, rlen_mean, rlen_sdev) = psorted_bam_reader.calculate_mean_sdev_isize(iterations)
        print "mean fragment length over %d reads: %.2f" % (iterations, isize_mean)
        print "standard deviation of fragment_length: %.2f" % (isize_sdev)
        print "mean read length: %.2f" % (rlen_mean)
        print "standard deviation of read length: %.2f" % (rlen_sdev)

        stats_file = open(output_prefix + ".read_stats.txt", "w")
        stats_file.write("fragment_length\t%.2f\n" % (isize_mean))
        stats_file.write("fragment_length_SD\t%.2f\n" % (isize_sdev))
        stats_file.write("read_length\t%.2f\n" % (rlen_mean))
        stats_file.write("read_length_SD\t%.2f" % (rlen_sdev))

        stats_file.close()

        # if the fragment sdev is much larger than expected, there might be a problem with the reads or the mapping.
        # Default to 0.1*fragment length as a reasonable guess: aberrant values for the sdev would otherwise
        # mess up the interval overlap calculation and the filtering.
        if isize_sdev > 0.2*isize_mean:
            isize_sdev = 0.1*isize_mean
            print "WARNING: fragment length standard deviation seems way too large to be realistic.\n\
            There may be something weird with the flags in your bam mapping, or a very large number of large SVs\n\
            that are messing up the count.\n\
            Setting the stdev to 0.1*fragment_length = %.2f for downstream calculations" % isize_sdev



        time = datetime.datetime.now()
        print "elapsed time: " + str(time - start_time)

    ################# Find valid discordant reads in given sorted bam file ################
    # This will print bam file(s) with the set(s) of valid discordant reads, meaning
    # that the reads in a pair are mapped at distances greater than expected or to
    # two different chromosomes, and with at least one read that is mapped uniquely

    # if strict_repetitive is TRUE, will print out two bam files:
    # <bam_file_name>.valid_discordant_pairs_strict_rep.bam, which is all valid discordant
    # read pairs with exactly one uniquely mapping and one repetitively mapping read
    #
    # <bam_file_name>.valid_discordant_pairs.bam, which is all valid discordant
    # read pairs with two uniquely mapping reads

    # if strict_repetitive is FALSE, will output both sets to a file named
    # <bam_file_name>.valid_discordant_pairs.bam

    # if the discordant reads have already been calculated in a previous run, don't do it again; instead look for a file called
    # <bam_file_name>.valid_discordant_pairs or <bam_file_name>.valid_discordant_pairs_strict_rep depending on the value of -s
    if not already_calc_discordant_reads:

        if mem_debug:
            reportResource("2")
        
        valid_discordant_reads_file_name = output_prefix + ".valid_discordant_pairs.bam"
        database_file=output_prefix+"dbfile.sqlite"  

        print "selecting discordant reads..."
        # this writes the bam file of discordant reads to disk to be used later, and returns the counts of the different types of reads
        (bam_stats, ref_lengths, ref_names) = psorted_bam_reader.select_discordant_reads_psorted( verbose, isize_mean, valid_discordant_reads_file_name)
        #print ref_names, ref_lengths
        coverage = (bam_stats["total_reads"] * rlen_mean )/ sum(ref_lengths)

        filter_conf_file = open(output_prefix + ".filter_config.txt", "w")
        filter_conf_file.write("cluster_size\t2\t%d\n" % (5*coverage))
        filter_conf_file.write("span\t2\t%d\n" % isize_mean)
        filter_conf_file.write("int_size\t%d\t%d\n" % (rlen_mean, 2*(isize_mean + 2*isize_sdev - (rlen_mean - rlen_sdev))) )
        filter_conf_file.write("softclipped\t2\t%d\n" % (5*coverage))
        filter_conf_file.write("pick_consistent\t0\t-1")

        filter_conf_file.close()

        bam_stats_file = open(output_prefix + ".bam_stats.txt", "w")
        for key, value in bam_stats.items():
            bam_stats_file.write("%s\t%d\n" % (key, value))

        if mem_debug:
            reportResource("3")
        bam_stats_file.close()

        time = datetime.datetime.now()
        if mem_debug:
            reportResource("4")
        print "elapsed time: " + str(time - start_time)


#        print "sorting proper pair bam file in the background... "
#        args = ["samtools", "sort", output_prefix + ".proper_pair.bam", output_prefix + ".proper_pair.sorted"]
#        proper_pair_sort = subprocess.Popen(args)
    else:
        print "using already selected discordant reads in %s" % (valid_discordant_reads_file_name)



    ################### Select valid discordant reads that match a TE #####################
    # Of the valid discordant read pairs, select those that have exactly one side that
    # overlaps a TE. This will return a list of AlignedReadPair objects.
    # The interval size is calculated as the INSIDE interval between the two reads (?? is this true?),
    # plus num_sdev * the sd of the insert size

    print "selecting discordant read pairs where exactly one maps to a TE..."
    interval_size = isize_mean + num_sdev * isize_sdev

    if te_annot:
        discordant_bam_reader = BamReader(valid_discordant_reads_file_name, output_prefix)
        read_pair_one_overlap_TE_list = discordant_bam_reader.select_read_pair_one_overlap_TE_annot(te_annot, interval_size, min_mapq,database_file,bin_size)
        read_pair_one_overlap_TE_list=[1,1]
        if not (print_extra_output or already_calc_discordant_reads):
            os.remove(valid_discordant_reads_file_name)

    else:

        #here you would map mate reads to TE sequences and whatnot.
        pass

    if mem_debug:
        reportResource("5")
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    ######################## wait till the proper pair bam file is sorted, and index it ###########################




    ##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping, non-TE genomic location,
    # then pair a fwd and a rev cluster if they overlap.
    # For the clusters that can be paired, calculate the softclipped support and the core reads,
    # which indicate the heterozygosity of the predicted TE insertion

    print "generating clusters..."
    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error, no discordant reads mapped to a TE location. please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)
    cluster_list = ClusterList(read_pair_one_overlap_TE_list)


####    COMMENTED TO TAKE ADVANTAGE OF THE GENERATED DATABASE FILE###
    #if not parallel:
    #    (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, "", False, min_cluster_size)
    #    all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ##parallel version:
    #else:
    #    # last two args are bed file handle and streaming, unnecessary and False, in this version
    #    all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, psorted_bamfile_name, "", False, min_cluster_size)

    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")
    all_clusters = cluster_list.generate_clusters_db(database_file,bin_size,output_prefix,"", verbose, bed_file_handle, True, min_cluster_size)
    all_clusters=list(load_pickle(output_prefix+'all_clusters.pkl'))

    del read_pair_one_overlap_TE_list
    gc.collect()
    if mem_debug:
        reportResource("5")
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)


    ###################### print reads that were clustered to bam, output to gff and table ########################################


    print "writing clustered reads to bam file, writing to gff and tables... "


    pair_gff_output_file = open(output_prefix + ".TE_insertions_paired_clusters.gff3", "w")
    pair_table_output_file = open(output_prefix + ".TE_insertions_paired_clusters.supporting_clusters.table", "w")
    pair_table_output_file.write(table_header(library_name, library_name, te_annot))

    # if print_extra_output: 
    #     single_gff_output_file = open(output_prefix + ".TE_insertions_single_cluster.gff3", "w")
    #     single_table_output_file = open(output_prefix + ".TE_insertions_single_cluster.supporting_clusters.table", "w")
    #     single_table_output_file.write(table_header(library_name, library_name, te_annot))

    print len(all_clusters)
    cluster_ID = 0
    for (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,strings) in all_clusters:


        # unpaired clusters are no longer reported
        # if print_extra_output:
        #     for fwd_cluster in unpaired_fwd_clusters:
        #         single_gff_output_file.write(fwd_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(fwd_cluster.to_table(cluster_ID, library_name))

        #         cluster_ID += 1

        #     for rev_cluster in unpaired_rev_clusters:
        #         single_gff_output_file.write(rev_cluster.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
        #         single_table_output_file.write(rev_cluster.to_table(cluster_ID, library_name))
        #         cluster_ID += 1

        #print cluster_ID
        for cluster_pair in cluster_pairs:
            pair_gff_output_file.write(cluster_pair.to_gff(cluster_ID, library_name, TE_name_tag) + "\n")
            pair_table_output_file.write(cluster_pair.to_table(cluster_ID, library_name))
            cluster_ID += 1

    #clustered_reads_bam_file.close()
    time = datetime.datetime.now()
    print "elapsed time: " + str(time - start_time)

    pair_gff_output_file.close()
    pair_table_output_file.close()

    # if print_extra_output:
    #     single_gff_output_file.close()
    #     single_table_output_file.close()

    
    end_time = str(datetime.datetime.now())
    print "done! at " + end_time

    run_stats = open(output_prefix + ".run_stats.txt", "w")
    run_stats.write("lib\t%s\n" % (library_name))
    if not already_calc_discordant_reads:
        run_stats.write("coverage\t%s\n" % (coverage))
    run_stats.write("runtime\t%s\n" % ( datetime.datetime.now() - start_time))
    run_stats.write("numCPUs\t%s\n" % (num_CPUs))
    run_stats.close()
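
Both variants above estimate genome-wide coverage from the discordant-read selection stats as total_reads * mean_read_length / total_reference_length, and then use 5 * coverage as the upper bound written into filter_config.txt for the cluster_size and softclipped filters. A tiny worked example with invented numbers:

# worked example with invented numbers, just to illustrate the formula above
total_reads = 130000000                     # "total_reads" from the bam/cram stats
rlen_mean = 100.0                           # mean read length
ref_lengths = [30000000, 25000000, 20000000, 19000000, 19000000, 17000000]  # chromosome lengths (130 Mb total)

coverage = (total_reads * rlen_mean) / sum(ref_lengths)
print(coverage)            # 100.0x for these numbers

cluster_size_upper = 5 * coverage
print(cluster_size_upper)  # 500.0, written with %d as "cluster_size\t2\t500"
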
    ##################### Cluster discordant read pairs that match TE #####################
    # Cluster the list of AlignedReadPair objects that have one read overlapping a TE
    # according to the uniquely mapping, non-TE genomic location,
    # then pair a fwd and a rev cluster if they overlap.
    # For the clusters that can be paired, calculate the softclipped support and the core reads,
    # which indicate the heterozygosity of the predicted TE insertion

    # bed file to store the insertion regions; used later to query the bam file and calculate the zygosity
    bed_file_name = output_prefix + ".insertion_regions.bed"
    bed_file_handle = open(bed_file_name, "w")

    print "generating clusters..."
    if len(read_pair_one_overlap_TE_list) == 0:
        print "there might be an error, no discordant reads mapped to a TE location. please check the gff file... are the chromosome names the same as in the reference?"
        sys.exit(2)
    cluster_list = ClusterList(read_pair_one_overlap_TE_list)
    
    #if not parallel:
    #    (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters) = cluster_list.generate_clusters(verbose, psorted_bamfile_name, bed_file_handle, True, min_cluster_size)
    #    all_clusters = [(cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)]
    ##parallel version:
    #else:
    #    # empty string is the psorted bam file name, and True is streaming
    #    all_clusters = cluster_list.generate_clusters_parallel(verbose, num_CPUs, bin_size, "", bed_file_handle, True, min_cluster_size,output_prefix)
    #    # bed_file_handle.close()
    
    ##### Low-RAM alternative: prints out all the results at once and uses the sqlite database file generated above.
    all_clusters = cluster_list.generate_clusters_db(database_file,bin_size,output_prefix,"", verbose, bed_file_handle, True, min_cluster_size)
 
    ### retrieve reads in the intervals where insertions were predicted and use them to calculate allelic frequency (zygosity) of the predictions
    ins_regions_bam_name = output_prefix + ".insertion_regions.reads.bam"
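    # (as in the CRAM example above, the reads in these regions would then be extracted
    # with samtools view -L <bed> into this BAM, indexed, and fetched per cluster pair
    # for the zygosity calculation)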