Example #1
0
def pair_clusters_by_bin((key, fwd_clusters, rev_clusters, bam_file_name, verbose)):


    print "processing cluster pairs on %s" % (key)
    #print "pairing clusters in parallel for chr %s" % fwd_clusters[0].chr
    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters)
    if verbose:
        print "non overlapping fwd clusters\t%d" % (len(non_overlapping_fwd_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters)
    if verbose:
        print "non overlapping rev clusters\t%d" % (len(non_overlapping_rev_clusters))
    proper_pair_bam = pysam.Samfile(bam_file_name, "rb")
    #print "haha"


    #print "ok1"
    #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
    cluster_pairs = []
    paired_fwd_clusters_indices = []
    paired_rev_clusters_indices = []
    for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
        for rev_index, rev_cluster in enumerate(non_overlapping_rev_clusters):
            if fwd_cluster.is_overlapping_strict(rev_cluster):
                new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                #print new_cluster_pair.get_chr()
                reads = proper_pair_bam.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                new_cluster_pair.calc_zygosity(reads)
                #print "poop"
                if new_cluster_pair.get_insertion_int_end() < new_cluster_pair.get_insertion_int_start():
                    if True:
                        print "cluster pair not paired!"
                else:
                    cluster_pairs.append(new_cluster_pair)
                    paired_fwd_clusters_indices.append(fwd_index)
                    paired_rev_clusters_indices.append(rev_index)

    #make lists of unpaired clusters
    unpaired_fwd_clusters = []
    unpaired_rev_clusters = []
    for fwd_index in range(len(non_overlapping_fwd_clusters)):
        if fwd_index not in paired_fwd_clusters_indices:
            unpaired_fwd_clusters.append(non_overlapping_fwd_clusters[fwd_index])

    for rev_index in range(len(non_overlapping_rev_clusters)):
        if rev_index not in paired_rev_clusters_indices:
            unpaired_rev_clusters.append(non_overlapping_rev_clusters[rev_index])

    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
Example #2
0
def pair_clusters_by_bin((key, fwd_clusters, rev_clusters, bam_file_name, verbose, bed_file_handle, streaming, min_cluster_size)):


    print "processing cluster pairs on %s" % (key)
    #print "pairing clusters in parallel for chr %s" % fwd_clusters[0].chr
    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters,min_cluster_size)
    if verbose:
        print "non overlapping fwd clusters\t%d" % (len(non_overlapping_fwd_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters,min_cluster_size)
    if verbose:
        print "non overlapping rev clusters\t%d" % (len(non_overlapping_rev_clusters))
    if not streaming:
        proper_pair_bam = pysam.Samfile(bam_file_name, "rb")
    #print "haha"


    #print "ok1"
    #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
    cluster_pairs = []
    paired_fwd_clusters_indices = []
    paired_rev_clusters_indices = []
    last_intersect=0
    bed_string = ""
    for fwd_index  in range(0,len(non_overlapping_fwd_clusters)):
            #if fwd_cluster.num_reads < min_cluster_size:
            #    continue
            fwd_cluster=non_overlapping_fwd_clusters[fwd_index]
            for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
                #if rev_cluster.num_reads < min_cluster_size:
                #    continue
                rev_cluster=non_overlapping_rev_clusters[rev_index]
                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    last_intersect=rev_index
                    #print new_cluster_pair.get_chr()
                    if not streaming:
                        reads = proper_pair_bam.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_line = new_cluster_pair.to_bed()
                        bed_string = bed_string + "\n" + bed_line
                    #print "poop"
                    if new_cluster_pair.get_insertion_int_end() < new_cluster_pair.get_insertion_int_start():
                        if True:
                            print "cluster pair not paired!"
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_clusters_indices.append(fwd_index)
                        paired_rev_clusters_indices.append(rev_index)
                elif  fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    break

    #make lists of unpaired clusters
    unpaired_fwd_clusters = []
    unpaired_rev_clusters = []
    for fwd_index in range(len(non_overlapping_fwd_clusters)):
        if fwd_index not in paired_fwd_clusters_indices:
            unpaired_fwd_clusters.append(non_overlapping_fwd_clusters[fwd_index])

    for rev_index in range(len(non_overlapping_rev_clusters)):
        if rev_index not in paired_rev_clusters_indices:
            unpaired_rev_clusters.append(non_overlapping_rev_clusters[rev_index])

    if streaming:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string)
    else:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
Example #3
0
    def generate_clusters(self, verbose, psorted_bamfile_name, bed_file_handle, streaming, min_cluster_size):
##################### BEGIN NON PARALLEL VERSION ######################################
        #cluster fwd intervals
        fwd_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "fwd"]
        fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)

        print "******************total fwd clusters found: %d" %  len(fwd_clusters)
        non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters,min_cluster_size)
        print "******************total fwd non-overlapping clusters found: %d" %  len(non_overlapping_fwd_clusters)


        #cluster rev intervals
        rev_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "rev"]
        rev_clusters = cluster_read_pairs_all(rev_read_pairs)

        print "******************total rev clusters found: %d" % len(rev_clusters)
        non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters,min_cluster_size)
        print "******************total rev non-overlapping clusters found: %d" %  len(non_overlapping_rev_clusters)

        #bam_file_name = output_prefix + ".proper_pair.sorted.bam"
        psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")


        #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
        cluster_pairs = []
        paired_fwd_clusters_indices = []
        paired_rev_clusters_indices = []
        bed_string = ""
        
        last_intersect=0
        # iterate over combinations of fwd and rev clusters, skipping if clusters dont meet min size requirements
        for fwd_index  in range(0,len(non_overlapping_fwd_clusters)):
            #if fwd_cluster.num_reads < min_cluster_size:
            #    continue
            fwd_cluster=non_overlapping_fwd_clusters[fwd_index]
            for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
                #if rev_cluster.num_reads < min_cluster_size:
                #    continue
                rev_cluster=non_overlapping_rev_clusters[rev_index]
                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    last_intersect=rev_index
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    if not streaming:
                        reads = proper_pair_bam.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_line = new_cluster_pair.to_bed()
                        if bed_string == "":
                            bed_string = bed_line
                        else:
                            bed_string = bed_string + "\n" + bed_line
                    if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                        if True:
                            print "cluster pair not paired!"
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_clusters_indices.append(fwd_index)
                        paired_rev_clusters_indices.append(rev_index)
                elif  fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    break
        #make lists of unpaired clusters
        unpaired_fwd_clusters = []
        unpaired_rev_clusters = []
        for fwd_index in range(len(non_overlapping_fwd_clusters)):
            if fwd_index not in paired_fwd_clusters_indices:
                unpaired_fwd_clusters.append(non_overlapping_fwd_clusters[fwd_index])

        for rev_index in range(len(non_overlapping_rev_clusters)):
            if rev_index not in paired_rev_clusters_indices:
                unpaired_rev_clusters.append(non_overlapping_rev_clusters[rev_index])



        print "******************total cluster pairs found: %d" %  len(cluster_pairs)
        if verbose:
            for (fwd_cluster, rev_cluster) in cluster_pairs:
                print "*************************cluster_pair:**************************************"
                print "fwd cluster:"
                print "cluster coordinates: %s %d %d" % (fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start, fwd_cluster[-1].interval_end )
                print " ".join(read.str_int() for read in fwd_cluster)
                print " ".join(read.str_TE_annot_list() for read in fwd_cluster)
                print "rev cluster:"
                print "cluster coordinates: %s %d %d" % (rev_cluster[0].interval_chr, rev_cluster[0].interval_start, rev_cluster[-1].interval_end )
                print " ".join(read.str_int() for read in rev_cluster)
                print " ".join(read.str_TE_annot_list() for read in rev_cluster)

        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string)
Example #4
0
def pair_clusters_by_bin(
    (key, fwd_clusters, rev_clusters, bam_file_name, verbose, bed_file_handle,
     streaming, min_cluster_size)):

    print "processing cluster pairs on %s" % (key)
    #print "pairing clusters in parallel for chr %s" % fwd_clusters[0].chr
    non_overlapping_fwd_clusters = remove_overlapping_clusters(
        fwd_clusters, min_cluster_size)
    if verbose:
        print "non overlapping fwd clusters\t%d" % (
            len(non_overlapping_fwd_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(
        rev_clusters, min_cluster_size)
    if verbose:
        print "non overlapping rev clusters\t%d" % (
            len(non_overlapping_rev_clusters))
    if not streaming:
        proper_pair_bam = pysam.Samfile(bam_file_name, "rb")
    #print "haha"

    #print "ok1"
    #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
    cluster_pairs = []
    paired_fwd_clusters_indices = []
    paired_rev_clusters_indices = []
    last_intersect = 0
    bed_string = ""
    for fwd_index in range(0, len(non_overlapping_fwd_clusters)):
        #if fwd_cluster.num_reads < min_cluster_size:
        #    continue
        fwd_cluster = non_overlapping_fwd_clusters[fwd_index]
        for rev_index in range(last_intersect,
                               len(non_overlapping_rev_clusters)):
            #if rev_cluster.num_reads < min_cluster_size:
            #    continue
            rev_cluster = non_overlapping_rev_clusters[rev_index]
            if fwd_cluster.is_overlapping_strict(rev_cluster):
                new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                last_intersect = rev_index
                #print new_cluster_pair.get_chr()
                if not streaming:
                    reads = proper_pair_bam.fetch(
                        new_cluster_pair.get_chr(),
                        new_cluster_pair.get_insertion_int_start(),
                        new_cluster_pair.get_insertion_int_end())
                    new_cluster_pair.calc_zygosity(reads)
                else:
                    bed_line = new_cluster_pair.to_bed()
                    bed_string = bed_string + "\n" + bed_line
                #print "poop"
                if new_cluster_pair.get_insertion_int_end(
                ) < new_cluster_pair.get_insertion_int_start():
                    if True:
                        print "cluster pair not paired!"
                else:
                    cluster_pairs.append(new_cluster_pair)
                    paired_fwd_clusters_indices.append(fwd_index)
                    paired_rev_clusters_indices.append(rev_index)
            elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                break

    #make lists of unpaired clusters
    unpaired_fwd_clusters = []
    unpaired_rev_clusters = []
    for fwd_index in range(len(non_overlapping_fwd_clusters)):
        if fwd_index not in paired_fwd_clusters_indices:
            unpaired_fwd_clusters.append(
                non_overlapping_fwd_clusters[fwd_index])

    for rev_index in range(len(non_overlapping_rev_clusters)):
        if rev_index not in paired_rev_clusters_indices:
            unpaired_rev_clusters.append(
                non_overlapping_rev_clusters[rev_index])

    if streaming:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
                bed_string)
    else:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
Example #5
0
    def generate_clusters(self, verbose, psorted_bamfile_name, bed_file_handle,
                          streaming, min_cluster_size):
        ##################### BEGIN NON PARALLEL VERSION ######################################
        #cluster fwd intervals
        fwd_read_pairs = [
            read_pair for read_pair in self.read_pair_list
            if read_pair.interval_direction == "fwd"
        ]
        fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)

        print "******************total fwd clusters found: %d" % len(
            fwd_clusters)
        non_overlapping_fwd_clusters = remove_overlapping_clusters(
            fwd_clusters, min_cluster_size)
        print "******************total fwd non-overlapping clusters found: %d" % len(
            non_overlapping_fwd_clusters)

        #cluster rev intervals
        rev_read_pairs = [
            read_pair for read_pair in self.read_pair_list
            if read_pair.interval_direction == "rev"
        ]
        rev_clusters = cluster_read_pairs_all(rev_read_pairs)

        print "******************total rev clusters found: %d" % len(
            rev_clusters)
        non_overlapping_rev_clusters = remove_overlapping_clusters(
            rev_clusters, min_cluster_size)
        print "******************total rev non-overlapping clusters found: %d" % len(
            non_overlapping_rev_clusters)

        #bam_file_name = output_prefix + ".proper_pair.sorted.bam"
        psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")

        #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
        cluster_pairs = []
        paired_fwd_clusters_indices = []
        paired_rev_clusters_indices = []
        bed_string = ""

        last_intersect = 0
        # iterate over combinations of fwd and rev clusters, skipping if clusters dont meet min size requirements
        for fwd_index in range(0, len(non_overlapping_fwd_clusters)):
            #if fwd_cluster.num_reads < min_cluster_size:
            #    continue
            fwd_cluster = non_overlapping_fwd_clusters[fwd_index]
            for rev_index in range(last_intersect,
                                   len(non_overlapping_rev_clusters)):
                #if rev_cluster.num_reads < min_cluster_size:
                #    continue
                rev_cluster = non_overlapping_rev_clusters[rev_index]
                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    last_intersect = rev_index
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    if not streaming:
                        reads = proper_pair_bam.fetch(
                            new_cluster_pair.get_chr(),
                            new_cluster_pair.get_insertion_int_start(),
                            new_cluster_pair.get_insertion_int_end())
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_line = new_cluster_pair.to_bed()
                        if bed_string == "":
                            bed_string = bed_line
                        else:
                            bed_string = bed_string + "\n" + bed_line
                    if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                        if True:
                            print "cluster pair not paired!"
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_clusters_indices.append(fwd_index)
                        paired_rev_clusters_indices.append(rev_index)
                elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    break
        #make lists of unpaired clusters
        unpaired_fwd_clusters = []
        unpaired_rev_clusters = []
        for fwd_index in range(len(non_overlapping_fwd_clusters)):
            if fwd_index not in paired_fwd_clusters_indices:
                unpaired_fwd_clusters.append(
                    non_overlapping_fwd_clusters[fwd_index])

        for rev_index in range(len(non_overlapping_rev_clusters)):
            if rev_index not in paired_rev_clusters_indices:
                unpaired_rev_clusters.append(
                    non_overlapping_rev_clusters[rev_index])

        print "******************total cluster pairs found: %d" % len(
            cluster_pairs)
        if verbose:
            for (fwd_cluster, rev_cluster) in cluster_pairs:
                print "*************************cluster_pair:**************************************"
                print "fwd cluster:"
                print "cluster coordinates: %s %d %d" % (
                    fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start,
                    fwd_cluster[-1].interval_end)
                print " ".join(read.str_int() for read in fwd_cluster)
                print " ".join(read.str_TE_annot_list()
                               for read in fwd_cluster)
                print "rev cluster:"
                print "cluster coordinates: %s %d %d" % (
                    rev_cluster[0].interval_chr, rev_cluster[0].interval_start,
                    rev_cluster[-1].interval_end)
                print " ".join(read.str_int() for read in rev_cluster)
                print " ".join(read.str_TE_annot_list()
                               for read in rev_cluster)

        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
                bed_string)
Example #6
0
    def generate_clusters(self, verbose, psorted_bamfile_name):
##################### BEGIN NON PARALLEL VERSION ######################################
        #cluster fwd intervals
        fwd_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "fwd"]
        fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)

        print "******************total fwd clusters found: %d" %  len(fwd_clusters)
        non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters)
        print "******************total fwd non-overlapping clusters found: %d" %  len(non_overlapping_fwd_clusters)


        #cluster rev intervals
        rev_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "rev"]
        rev_clusters = cluster_read_pairs_all(rev_read_pairs)

        print "******************total rev clusters found: %d" % len(rev_clusters)
        non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters)
        print "******************total rev non-overlapping clusters found: %d" %  len(non_overlapping_rev_clusters)

        #bam_file_name = output_prefix + ".proper_pair.sorted.bam"
        psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")


        #pair clusters by genomic location, keeping track of which indices in the array have been paired, so that you can pick out the unpaired ones after
        cluster_pairs = []
        paired_fwd_clusters_indices = []
        paired_rev_clusters_indices = []
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index, rev_cluster in enumerate(non_overlapping_rev_clusters):
                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    reads = psorted_bamfile.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                    new_cluster_pair.calc_zygosity(reads)
                    if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                        if True:
                            print "cluster pair not paired!"
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_clusters_indices.append(fwd_index)
                        paired_rev_clusters_indices.append(rev_index)

        #make lists of unpaired clusters
        unpaired_fwd_clusters = []
        unpaired_rev_clusters = []
        for fwd_index in range(len(non_overlapping_fwd_clusters)):
            if fwd_index not in paired_fwd_clusters_indices:
                unpaired_fwd_clusters.append(non_overlapping_fwd_clusters[fwd_index])

        for rev_index in range(len(non_overlapping_rev_clusters)):
            if rev_index not in paired_rev_clusters_indices:
                unpaired_rev_clusters.append(non_overlapping_rev_clusters[rev_index])



        print "******************total cluster pairs found: %d" %  len(cluster_pairs)
        if verbose:
            for (fwd_cluster, rev_cluster) in cluster_pairs:
                print "*************************cluster_pair:**************************************"
                print "fwd cluster:"
                print "cluster coordinates: %s %d %d" % (fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start, fwd_cluster[-1].interval_end )
                print " ".join(read.str_int() for read in fwd_cluster)
                print " ".join(read.str_TE_annot_list() for read in fwd_cluster)
                print "rev cluster:"
                print "cluster coordinates: %s %d %d" % (rev_cluster[0].interval_chr, rev_cluster[0].interval_start, rev_cluster[-1].interval_end )
                print " ".join(read.str_int() for read in rev_cluster)
                print " ".join(read.str_TE_annot_list() for read in rev_cluster)

        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)