Example #1
def filter_itxs(feature):
    n = len(feature.fields) / 2
    del_interval_idp = map(int, feature.fields[7].split("-"))
    del_interval_itx_1 = map(int,
                             feature.fields[n + 7].split(",")[0].split("-"))
    del_interval_itx_2 = map(int,
                             feature.fields[n + 7].split(",")[1].split("-"))
    if filter(lambda x: abs(x[0] - del_interval_idp[0]) + abs(
                    x[1] - del_interval_idp[1]) == 0,
              [del_interval_itx_1, del_interval_itx_2]) and "LowQual" not in \
            feature.fields[n + 4]:
        return None
    return pybedtools.Interval(feature.chrom,
                               feature.start,
                               feature.end,
                               name=feature.name,
                               score=feature.score,
                               otherfields=feature.fields[6:n])
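
A minimal usage sketch (not from the original source): filter_itxs is written to be applied through BedTool.each(), which in the pybedtools versions these projects target skips features for which the callback returns a falsy value such as None. The file name below is hypothetical, and each of its lines is assumed to hold two concatenated records so that feature.fields splits evenly in half.

import pybedtools

candidates = pybedtools.BedTool("itx_candidates.bed")  # hypothetical pairwise-style input
filtered = candidates.each(filter_itxs).saveas("itx_filtered.bed")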
Example #2
def chop(chr, start, end, wsize=5):
    """
    Writes mean phastCons conservation scores for fixed-size windows across a region.

    For circos plotting; not used, add back if we want more output later.
    """
    file = open("phastcons.txt", 'w')
    i = start
    while i < end:
        x = pybedtools.Interval(chr, i, (i+wsize))
        p = get_mean_phastcons(x, species="hg19")
        file.write("\t".join(map(str, [chr, i, (i+wsize-1), p])) + "\n")
        i += wsize
    file.close()
Example #3
    def __getitem__(self, key):
        iterator = self.fileobj.fetch(
            str(key.chrom),
            key.start,
            key.stop)
        for r in iterator:
            start = r.pos
            curr_end = r.pos
            for op, bp in r.cigar:
                start = curr_end
                curr_end += bp
                if op == 0:
                    interval = pybedtools.Interval(
                        self.fileobj.references[r.rname],
                        start,
                        curr_end,
                        strand=strand_lookup[r.flag & 0x0010])
                    interval.file_type = 'bed'
                    yield interval
Example #4
def add_weighted_score(in_bed, score_bed):
    out_bed = in_bed.intersect(score_bed, wao=True).saveas(os.path.join(args.tmpdir, "score.bed"))

    bed_array = []
    last_interval = pybedtools.Interval("", 0, 0)
    map_value = 0.0
    for interval in out_bed:
        if interval.chrom != last_interval.chrom or interval.start != last_interval.start or interval.end != last_interval.end:
            if last_interval.chrom:
                bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))
            map_value = 0.0
            last_interval = interval
        if float(interval.fields[-1]) > 0:
            map_value += float(interval.fields[-1]) * float(interval.fields[-2]) / float(interval.length)

    if last_interval.chrom:
        bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))

    return pybedtools.BedTool(bed_array)
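
A hedged usage sketch: both arguments are BedTool objects, the score track is assumed to carry a numeric value in its last column, and the function additionally relies on the module-level os and args.tmpdir of its original script. File names are hypothetical.

import pybedtools

regions = pybedtools.BedTool("regions.bed")
scores = pybedtools.BedTool("mappability.bedgraph")
weighted = add_weighted_score(regions, scores)  # final column holds the length-weighted score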
Example #5
def find_idp(feature, wiggle):
    n = len(feature.fields) / 2
    if feature.chrom != feature.fields[n]:
        return None
    start_dup = feature.start
    end_dup = feature.end
    start_del = int(feature.fields[n + 1])
    end_del = int(feature.fields[n + 2])
    if abs(start_del - end_del) > (abs(start_dup - end_dup) - wiggle):
        return None
    dist_ends = [abs(start_del - start_dup), abs(end_del - end_dup)]
    if min(dist_ends) > wiggle:
        return None
    del_pos = start_del if dist_ends[0] > dist_ends[1] else end_del
    name = "%s,%s" % (feature.name, feature.fields[n + 3])
    score = "%s,%s" % (feature.score, feature.fields[n + 4])
    return pybedtools.Interval(feature.chrom, feature.start, feature.end,
                               name=name, score=score,
                               otherfields=["%d" % del_pos,
                                            "%d-%d" % (start_del, end_del)])
Example #6
def extract_candidate_split_regions(
        work, filtered_candidates_vcfs, split_regions, ensemble_beds,
        reference, matrix_base_pad, merge_d_for_short_read):
    logger = logging.getLogger(extract_candidate_split_regions.__name__)

    candidates_split_regions = []
    for i, (filtered_vcf, split_region_) in enumerate(zip(filtered_candidates_vcfs,
                                                          split_regions)):
        candidates_region_file = os.path.join(
            work, "candidates_region_{}.bed".format(i))
        candidates_bed = pybedtools.BedTool(filtered_vcf).each(
            lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[1]) + len(x[3]))).sort().slop(
            g=reference + ".fai", b=matrix_base_pad + 3).merge(d=merge_d_for_short_read)
        if ensemble_beds:
            candidates_bed = candidates_bed.cat(ensemble_beds[i], postmerge=False).sort(
            ).merge(d=merge_d_for_short_read)
        candidates_bed.intersect(split_region_).sort().saveas(
            candidates_region_file)
        candidates_split_regions.append(candidates_region_file)
    return candidates_split_regions
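
A small illustration, under assumed inputs, of the pad-and-merge chain used above on a toy BedTool; "genome.fa.fai" is a hypothetical FASTA index, which slop() accepts as a genome file so padded intervals stay within chromosome bounds.

import pybedtools

toy = pybedtools.BedTool([pybedtools.Interval("chr1", 100, 101),
                          pybedtools.Interval("chr1", 160, 161)])
padded = toy.sort().slop(g="genome.fa.fai", b=10).merge(d=100)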
Example #7
def check_duplicates(interval1, interval2, max_dist=10):
    if interval1.chrom != interval2.chrom or \
                    abs(interval1.start - interval2.start) > max_dist or \
                    abs(interval1.end - interval2.end) > max_dist or \
                    interval1.fields[3].split(",")[1] != \
                    interval2.fields[3].split(",")[1]:
        return None
    info1 = json.loads(base64.b64decode(interval1.fields[3].split(",")[0]))
    info2 = json.loads(base64.b64decode(interval2.fields[3].split(",")[0]))
    svmethods = sorted(list(set(info1["SVMETHOD"] + info2["SVMETHOD"])))
    sources = []
    if "SOURCES" in info1:
        sources.append(info1["SOURCES"])
    if "SOURCES" in info2:
        sources.append(info2["SOURCES"])
    sources = ",".join(sources)
    if sources:
        info1["SOURCES"] = sources
    if "PASS" in [
            interval1.fields[7], interval2.fields[7]
    ] or ("AS" not in svmethods and len(set(svmethods) - {"SC", "AS"}) > 1):
        sv_filter = "PASS"
    else:
        sv_filter = "LowQual"

    end = max(interval1.end, interval2.end)
    start = min(interval1.start, interval2.start)
    info1.update({
        "END": end,
        "SVMETHOD": svmethods,
        "NUM_SVMETHODS": len(svmethods)
    })
    return pybedtools.Interval(
        interval1.chrom,
        start,
        end,
        name="%s,%s,%d,%s" %
        (base64.b64encode(json.dumps(info1)), info1["SVTYPE"], end - start,
         ";".join(svmethods)),
        score=interval1.score,
        otherfields=[interval1.fields[6], sv_filter])
Example #8
def _regions_for_coverage(data, region, out_file):
    """Retrieve BedTool iterator over regions we need to calculate coverage in.
    """
    variant_regions = utils.get_in(data, ("config", "algorithm", "variant_regions"))
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    if not ready_region:
        return get_ref_bedtool(data["sam_ref"], data["config"])
    elif os.path.isfile(ready_region):
        return pybedtools.BedTool(ready_region).intervals
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        return [pybedtools.Interval(c, s, e)]
    else:
        assert isinstance(ready_region, basestring)
        out = []
        for r in [x for x in get_ref_bedtool(data["sam_ref"], data["config"])
                  if x.chrom == ready_region]:
            # If we have variant regions but none in this region, don't calculate coverage
            r.attrs["no_coverage"] = variant_regions is not None
            out.append(r)
        return out
Example #9
def add_pairwise_bedtool_track(pairs_bedpe_bt, track, action, binsize):
    """
    Extract feature values (as list) for all pairs vs. a BedTool (as BEDPE)
    """
    
    # Multiple actions all start with computing bedtools pairtopair -a both
    if action in 'count-pairs pairwise-coverage any-pairwise-overlap'.split():

        track_hits = pairs_bedpe_bt.pairtopair(b=track, type='both')
        if action in 'count-pairs any-pairwise-overlap'.split():
            counts_only = True
        else:
            counts_only = False
        hits_per_bin = _split_pairtopair_by_binpairs(track_hits, pairs_bedpe_bt, 
                                                         counts_only=counts_only)

        if action == 'count-pairs':
            values = list(hits_per_bin.values())

        elif action == 'any-pairwise-overlap':
            values = [min([1, k]) for k in hits_per_bin.values()]

        elif action == 'pairwise-coverage':
            values = []
            for pair, hits in hits_per_bin.items():
                if len(hits) > 0:
                    fields = pair.split('_')
                    pair_interval = pbt.Interval(fields[0], int(fields[1]), int(fields[2]))
                    pairbt = _split_pairs(pair_interval, binsize)
                    covdf_names = 'chr start end items bp total frac'.split()
                    covdf = pairbt.coverage(hits).to_dataframe(names=covdf_names)
                    values.append(covdf.bp.sum() / covdf.total.sum())
                else:
                    values.append(0)

    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    return values
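
A hedged usage sketch: pairs_bedpe_bt must be a BEDPE-formatted BedTool, the module is assumed to import pybedtools as pbt, and the private helpers referenced inside the function must be available. File names are hypothetical.

import pybedtools as pbt

pairs_bedpe_bt = pbt.BedTool("bin_pairs.bedpe")
track = pbt.BedTool("enhancers.bed")
values = add_pairwise_bedtool_track(pairs_bedpe_bt, track,
                                    action="any-pairwise-overlap", binsize=10000)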
Example #10
def plot_multiple_regions_coverage(samples, out_file, region_bed=None, stem_bed=None):
    """
    given a list of bcbio samples and a bed file or BedTool of regions,
    makes a plot of the coverage in the regions for the set of samples

    if given a bed file or BedTool of locations in stem_bed with a label,
    plots lollipops at those locations
    """
    mpl.use('Agg', force=True)
    PAD = 100
    if file_exists(out_file):
        return out_file
    in_bams = [dd.get_align_bam(x) for x in samples]
    samplenames = [dd.get_sample_name(x) for x in samples]
    if isinstance(region_bed, six.string_types):
        region_bed = pybedtools.BedTool(region_bed)
    if isinstance(stem_bed, six.string_types):
        stem_bed = pybedtools.BedTool(stem_bed)
    if stem_bed is not None:  # tabix-indexed BedTool objects evaluate to False
        stem_bed = stem_bed.tabix()
    plt.clf()
    plt.cla()
    with file_transaction(out_file) as tx_out_file:
        with backend_pdf.PdfPages(tx_out_file) as pdf_out:
            sns.despine()
            for line in region_bed:
                for chrom, start, end in _split_regions(line.chrom, max(line.start - PAD, 0),
                                                        line.end + PAD):
                    df = _combine_regional_coverage(in_bams, samplenames, chrom,
                                                    start, end, os.path.dirname(tx_out_file))
                    plot = sns.tsplot(df, time="position", unit="chrom",
                                      value="coverage", condition="sample")
                    if stem_bed is not None:  # tabix-indexed BedTool objects evaluate to False
                        interval = pybedtools.Interval(chrom, start, end)
                        _add_stems_to_plot(interval, stem_bed, samples, plot)
                    plt.title("{chrom}:{start}-{end}".format(**locals()))
                    pdf_out.savefig(plot.get_figure())
                    plt.close()
    return out_file
Example #11
def build_chr2_ins(feature,thr_top=0.15):
    sc_chr2_str=feature.fields[6]
    if sc_chr2_str==".":
        return []
    sub_str=map(lambda x:[x.split(";")[0],map(int,x.split(";")[1:])],sc_chr2_str.split(","))
    chr2_dict={}
    for chr2,poses in sub_str:
        if chr2 not in chr2_dict:
            chr2_dict[chr2]=[]
        chr2_dict[chr2].append(poses)

    chr2_dict={k:[sum(map(lambda x:x[0],v)),min(map(lambda x:x[1],v)),max(map(lambda x:x[2],v))] for k,v in chr2_dict.iteritems()}
    sorted_chr2=sorted(chr2_dict.items(),key=lambda x: x[1][0],reverse=True)
    n_reads=sum(map(lambda x:x[1][0],sorted_chr2))
    top_chr2s=filter(lambda x: x[1][0]>(thr_top*n_reads) and x[0] not in ["-1",feature.chrom],sorted_chr2)
    if not top_chr2s:
        return []    
    ctx_intervals=[]
    for chr2,[cnt,start,end] in top_chr2s:
        ctx_intervals.append(pybedtools.Interval(chr2, start, end, 
                               name=feature.name,score=feature.score))
    return ctx_intervals
Example #12
def test_indexing():
    """
    Indexing into BedTools
    """
    a = pybedtools.example_bedtool('a.bed')

    # This is the first line
    interval = pybedtools.Interval('chr1', 1, 100, 'feature1', '0', '+')

    # just to make sure
    assert interval == iter(a).next()

    # test slice behavior
    results = list(a[0:2])
    assert len(results) == 2
    assert results[0] == interval

    # test single-integer indexing
    assert a[0] == interval

    # only slices and integers allowed....
    assert_raises(ValueError, a.__getitem__, 'key')
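
The test above uses Python 2's iterator .next(); a minimal Python 3 sketch of the same checks looks like this.

import pybedtools

a = pybedtools.example_bedtool('a.bed')
first = pybedtools.Interval('chr1', 1, 100, 'feature1', '0', '+')
assert next(iter(a)) == first   # .next() is Python 2 only
assert a[0] == first
assert len(list(a[0:2])) == 2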
Example #13
def find_itx(feature, wiggle):
    n = len(feature.fields) / 2
    start_idp1 = feature.start
    end_idp1 = feature.end
    start_idp2 = int(feature.fields[n + 1])
    end_idp2 = int(feature.fields[n + 2])
    dist_ends = [abs(start_idp1 - start_idp2), abs(end_idp1 - end_idp2)]
    if min(dist_ends) > wiggle:
        return None
    del_pos1 = int(feature.fields[6])
    del_pos2 = int(feature.fields[n + 6])
    if abs(del_pos1 - del_pos2) > wiggle:
        return None

    del_interval1 = map(int, feature.fields[7].split("-"))
    del_interval2 = map(int, feature.fields[n + 7].split("-"))
    lr_1 = 1 if abs(del_pos1 - del_interval1[0]) < abs(del_pos1 -
                                                       del_interval1[1]) else 0
    lr_2 = 1 if abs(del_pos2 - del_interval2[0]) < abs(del_pos2 -
                                                       del_interval2[1]) else 0
    if lr_1 == lr_2 or lr_2 < lr_1:
        return None

    del_id_2 = feature.name.split(",")[-1]
    del_filter_2 = feature.score.split(",")[-1]
    name = "%s,%s" % (feature.name, del_id_2)
    score = "%s,%s" % (feature.score, del_filter_2)

    return pybedtools.Interval(feature.chrom,
                               feature.start,
                               feature.end,
                               name=name,
                               score=score,
                               otherfields=[
                                   "%d" % ((del_pos1 + del_pos2) / 2),
                                   "%d-%d,%d-%d" %
                                   (del_interval1[0], del_interval1[1],
                                    del_interval2[0], del_interval2[1])
                               ])
Example #14
    def get_wg_coverage(self):
        """Generator that takes as input a sorted BAM and a merged BED of the circle intervals across the whole
        genome, and yields a numpy coverage array for every (optionally extended) interval"""



        reference_contigs = self.bam.header['SQ']

        header_dict = {}
        for reference in reference_contigs:
            header_dict[reference['SN']] = reference['LN']

        merged_bed = self.bed.sort().merge()

        for interval in merged_bed:

            coverage_dict = {}
            if interval.start - self.ext < 0:
                start = 0

            else:
                start = interval.start - self.ext

            if header_dict[interval.chrom] > (interval.end + self.ext):
                end = interval.end + self.ext
            else:
                end = interval.end

            cov = self.bam.count_coverage(contig=interval.chrom, start=start, end=end, quality_threshold=self.mapq)
            summarized_cov = np.array([cov[0], cov[1], cov[2], cov[3]]).sum(axis=0)

            # save memory, convert to uint32.
            summ_cov = np.uint32(summarized_cov)

            print("Computing coverage on interval %s:%s-%s" % (interval.chrom,interval.start,interval.end))
            coverage_dict[bt.Interval(interval.chrom, start, end)] = summ_cov

            yield(coverage_dict,header_dict)
Example #15
def merged_interval_features(feature, bam_handle):
    support_list = feature.name.split(",")
    locations = sorted(map(int, support_list[0::2]))
    num_unique_locations = len(set(locations))
    count_str = ",".join([
        "%s,%s" % (i, c) for (i, c) in collections.Counter(locations).items()
    ])
    plus_support = len([i for i in support_list[1::2] if i == "+"])
    minus_support = len(locations) - plus_support
    locations_span = max(locations) - min(locations)
    name = "%s,INS,0,SC,%d,%d,%d,%d,%s" % (base64.b64encode(json.dumps(
        dict())), plus_support, minus_support, locations_span,
                                           num_unique_locations, count_str)
    interval_readcount = bam_handle.count(reference=feature.chrom,
                                          start=feature.start,
                                          end=feature.end)

    return pybedtools.Interval(feature.chrom,
                               feature.start,
                               feature.end,
                               name=name,
                               score=feature.score,
                               otherfields=[str(interval_readcount)])
Example #16
    def parallel_array_test(self):
        features = [(0, 20), (61, 81), (200, 220)]
        features = [pybedtools.Interval('chr2L', *i) for i in features]
        arr0 = self.m.array(features, bins=20, fragment_size=5)
        assert np.all(arr0 == np.array(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]))

        arr1 = self.m.array(features,
                            bins=20,
                            fragment_size=5,
                            chunksize=1,
                            processes=PROCESSES)
        assert np.all(arr0 == arr1)

        # mix up the chunksize and processes
        arr2 = self.m.array(features,
                            bins=20,
                            fragment_size=5,
                            chunksize=3,
                            processes=PROCESSES)
        assert np.all(arr0 == arr2)

        # use more features and test for identity again
        features *= 1000
        print len(features)
        arr0 = self.m.array(features, bins=20, fragment_size=5)
        arr1 = self.m.array(features,
                            bins=20,
                            fragment_size=5,
                            chunksize=5,
                            processes=PROCESSES)
        print arr0.shape
        print arr1.shape
        print(arr0 != arr1)
        assert np.all(arr0 == arr1)
Example #17
    def coords_to_bedtool(self, single_exon_coords):
        """Convert exon coordinates to bedtool intervals

        Assumes that the coordinates are in the exact same order as the
        original miso ids.

        Parameters
        ----------
        single_exon_coords : list
            List of (chrom, start, stop, strand) tuples of a single exon's
            coordinates

        Returns
        -------
        bedtool : pybedtools.BedTool
            A bedtool object of the exon intervals
        """
        if len(single_exon_coords) != len(self.miso_ids):
            raise ValueError("Number of coordinates must equal the number of "
                             "original miso ids")
        intervals = []
        for miso_id, exon in zip(self.miso_ids, single_exon_coords):
            chrom, start, stop, strand = exon

            # Base-0-ify
            start = int(start) - 1
            stop = int(stop)

            intervals.append(
                pybedtools.Interval(chrom,
                                    start,
                                    stop,
                                    strand=strand,
                                    name=miso_id,
                                    score='1000'))
        return pybedtools.BedTool(intervals)
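
A standalone sketch of the conversion performed above, with hypothetical ids and coordinates: 1-based inclusive exon coordinates become 0-based half-open pybedtools Intervals.

import pybedtools

ids = ["event1", "event2"]
coords = [("chr1", 1000, 1200, "+"), ("chr2", 5000, 5300, "-")]
intervals = [pybedtools.Interval(c, int(s) - 1, int(e), strand=strand, name=i, score='1000')
             for i, (c, s, e, strand) in zip(ids, coords)]
bedtool = pybedtools.BedTool(intervals)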
Example #18
def parse(BAM,BED): 

	'''
	Parse BAM and get overall and queries-specific statistics
	'''

	S_dict=dict()

	S_dict['BAM_PRIM']=0 #reads
	S_dict['BAM_SUPP']=0 #reads
	S_dict['BAM_SEC']=0 #reads
	S_dict['BAM_UNMAP']=0 #reads
	S_dict['BAM_ONTARGET']=0 #reads
	S_dict['BAM_OFFTARGET']=0 #reads
	S_dict['BAM_CSOFT_CLIP']=0 #bps
	S_dict['BAM_CMATCH']=0 #bps
	S_dict['BAM_CINS']=0 #bps
	S_dict['BAM_CDEL']=0 #bps
	S_dict['BAM_CDIFF']=0 #bps
	S_dict['BAM_LEN'] = [] #all lengths in list
	S_dict['BAM_QUAL'] = [] #all qualities in list
	S_dict['BAM_PID'] = [] #all PID in list

	bamfile=pysam.AlignmentFile(BAM, 'rb')
	
	try:
		
		bedfile=pybedtools.BedTool(BED)
		bedsrtd=bedfile.sort()

	except:

		now=datetime.now().strftime('%d/%m/%Y %H:%M:%S')
		print('[' + now + ']' + '[Error] Invalid BED file format')
		sys.exit(1)

	now=datetime.now().strftime('%d/%m/%Y %H:%M:%S')
	print('[' + now + ']' + '[Message] Parsing BAM file')

	ivf=bedsrtd.as_intervalfile()

	for read in bamfile.fetch():

		if read.has_tag('MD') and read.has_tag('NM'):

			if not read.is_unmapped:

				if not read.is_supplementary:

					if not read.is_secondary:

						S_dict['BAM_PRIM']+=1

						query=pybedtools.Interval(read.reference_name,read.reference_start,read.reference_end)
						
						if ivf.any_hits(query) >=1: #parse CIGAR only in targeted regions

							S_dict['BAM_ONTARGET']+=1

							Cdict=tuptodict(read.cigartuples)
							MD=read.get_aligned_pairs(with_seq=True)

							S_dict['BAM_CSOFT_CLIP']+=Cdict[4]
							S_dict['BAM_CINS']+=Cdict[1]
							S_dict['BAM_CDEL']+=Cdict[2]
							S_dict['BAM_CMATCH']+=sum(1 for x,y,z in MD if x is not None and z is not None and z[0].isupper())
							S_dict['BAM_CDIFF']+=sum(1 for x,y,z in MD if x is not None and z is not None and z[0].islower())
							S_dict['BAM_QUAL'].append(np.mean(read.query_qualities))

							NM=read.get_tag('NM')
							refcoords=read.get_reference_positions()
							reflen=refcoords[-1]-refcoords[0]#reference spans from first aligned to last aligned
							seqlen=len(read.get_reference_positions(full_length=True))#this is the read length
							S_dict['BAM_LEN'].append(seqlen)
							PID=100-100*NM/max(reflen,seqlen)
							S_dict['BAM_PID'].append(PID)

						else:

							S_dict['BAM_OFFTARGET']+=1

					else:

						S_dict['BAM_SEC']+=1

				else:

					S_dict['BAM_SUPP']+=1

			else:

				S_dict['BAM_UNMAP']+=1

		else:

			now=datetime.now().strftime('%d/%m/%Y %H:%M:%S')
			print('[Error] BAM misses the required MD/NM tags')
			bamfile.close()
			sys.exit(1)

	#calculate coverage in regions from BED

	for query in bedsrtd:

		key=query.chrom+':'+str(query.start)+'-'+str(query.end)
		now=datetime.now().strftime('%d/%m/%Y %H:%M:%S')
		print('[' + now + ']' + '[Message] Calculating coverage in region ' + key)
		query_arr=bamfile.count_coverage(query.chrom,query.start,query.end,quality_threshold=0, read_callback=check_read)
		perbasecov=np.sum(query_arr,axis=0).tolist()
		S_dict[key]=perbasecov

	bamfile.close()

	return S_dict
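
A hedged sketch of the on/off-target test used above: as_intervalfile() returns an IntervalFile that supports fast any_hits() lookups against an Interval built from a read's reference coordinates. The BED file name is hypothetical.

import pybedtools

targets = pybedtools.BedTool("targets.bed").sort().as_intervalfile()
query = pybedtools.Interval("chr1", 1000000, 1000150)
on_target = targets.any_hits(query) >= 1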
Example #19
def split_region(work,
                 region_bed_file,
                 num_splits,
                 max_region=1000000,
                 min_region=20,
                 shuffle_intervals=False):

    logger.info("-----------------------------------------------------------")
    logger.info("Split region")
    logger.info("-----------------------------------------------------------")

    regions_bed = pybedtools.BedTool(region_bed_file).sort().merge(d=0)
    intervals = []
    for region in regions_bed:
        chrom, start, end = region.chrom, region.start, region.end
        if region.length + 1 > max_region:
            for i in range(start, end + 1, max_region):
                intervals.append(
                    pybedtools.Interval(chrom, i, min(end,
                                                      i + max_region - 1)))
        else:
            intervals.append(region)
    if shuffle_intervals:
        shuffle(intervals)
    regions_bed = pybedtools.BedTool(intervals)
    total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed))
    logger.info("Total length: {}".format(total_len))
    split_len = total_len / num_splits
    split_regions = []
    current_regions = []
    sofar_len = 0
    current_len = 0
    split_lens = []
    for region in regions_bed:
        chrom, start, end = region[0:3]
        start, end = int(start), int(end)
        s = start
        e = -1
        while (current_len < split_len):
            s = max(s, e + 1)
            e = min(s + split_len - current_len - 1, end)
            if (e - s + 1) < 2 * min_region:
                e = min(s + 2 * min_region - 1, end)
            if (end - e) < 2 * min_region:
                e = end
            current_regions.append(pybedtools.Interval(chrom, s, e))
            current_len += e - s + 1
            if (current_len >= split_len):
                sofar_len += current_len
                split_lens.append(current_len)
                current_len = 0
                split_regions.append(current_regions)
                current_regions = []
                if split_len < (total_len - sofar_len) < 1.5 * split_len:
                    split_len = total_len - sofar_len
            if e >= end:
                break
    if current_regions:
        split_lens.append(current_len)
        split_regions.append(current_regions)
    split_region_files = []
    for i, split_region_ in enumerate(split_regions):
        split_region_file = os.path.join(work, "region_{}.bed".format(i))
        pybedtools.BedTool(split_region_).saveas(split_region_file)
        logger.info("Split {}: {}".format(i, split_lens[i]))
        split_region_files.append(split_region_file)
    sum_len = sum(split_lens)
    logger.info("Total splitted length: {}".format(sum_len))
    return split_region_files
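
A minimal usage sketch with hypothetical paths: the function writes region_<i>.bed files under the work directory and returns their paths, and it relies on the module-level logger, os, pybedtools, and shuffle imports of its original module.

split_files = split_region("work", "regions.bed", num_splits=4)
# e.g. ["work/region_0.bed", "work/region_1.bed", "work/region_2.bed", "work/region_3.bed"]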
Example #20
giremi_to_vcf(editor_pred, vcf_file)
editor_bed = vcf_to_bed(vcf_file, all_otherfields=True)
cmd = "java -jar  %s vcfcompare -true_vcf %s -prefix %s.NISTHCnonDB %s" % (
    varsim_jar, NIST_HC_nonDB, pred_file, vcf_file)
if not os.path.exists("%s.NISTHCnonDB_TP.vcf" % (pred_file)):
    a = os.system(cmd)
    print cmd
    if a != 0:
        print a

# In[17]:

pred_edited = {}
edit_bed = pybedtools.BedTool([
    pybedtools.Interval(x[1],
                        int(x[2]) - 1, int(x[2]), x[17],
                        find_er(x[17][0], x[17][1], x[3], x[18:22]))
    for x in editor_pred if int(x[22]) > 0
])
for region, region_bed in [["Alu", Alu_regions],
                           ["nonAlu-reps", nonAlu_rep_regions],
                           ["nonreps", ""], ["all", ""]]:
    if region in ["Alu", "nonAlu-reps"]:
        my_edit_bed = edit_bed.window(region_bed, w=0, u=True)
    elif region == "nonreps":
        my_edit_bed = edit_bed.window(Alu_regions, w=0, v=True)
        my_edit_bed = my_edit_bed.window(nonAlu_rep_regions, w=0, v=True)
    elif region == "all":
        my_edit_bed = edit_bed.sort()
    edit_types = [x[3] for x in my_edit_bed]
    edit_ratios = [x[4] for x in my_edit_bed]
Example #21
def get_insertion_breakpoints(age_records, intervals, window=20, start=0):
    func_logger = logging.getLogger("%s-%s" %
                                    (get_insertion_breakpoints.__name__,
                                     multiprocessing.current_process()))
    bedtools_intervals = [
        pybedtools.Interval("1", interval[0], interval[1])
        for interval in sorted(intervals)
    ]
    func_logger.info("bedtools_intervals %s" % (str(bedtools_intervals)))
    if not bedtools_intervals:
        return []

    potential_breakpoints = sorted(
        list(
            set([interval.start for interval in bedtools_intervals] +
                [interval.end for interval in bedtools_intervals])))

    breakpoints = []
    for breakpoint in potential_breakpoints[1:-1]:
        # Check if the breakpoint is within window distance of a validated breakpoint
        if min([window + 1] + [abs(b[0] - breakpoint)
                               for b in breakpoints]) <= window:
            continue
        func_logger.info("\tExamining potential breakpoint %d for support" %
                         breakpoint)
        left_support = [
            interval[0] for interval in intervals
            if abs(interval[0] - breakpoint) <= window
        ]
        right_support = [
            interval[1] for interval in intervals
            if abs(interval[1] - breakpoint) <= window
        ]
        counter_examples = [
            age_record for age_record in age_records
            if age_record.has_long_ref_flanks() and (
                age_record.has_ref_deletion(window)
                or age_record.has_insertion(min_diff=1, max_diff=49))
            and age_record.breakpoint_match(breakpoint, window)
        ]
        if counter_examples:
            counter_example_ends = [
                age_record.start1_end1s for age_record in counter_examples
            ]
            func_logger.info("\t\tSkipping breakpoint %d due to %s" %
                             (breakpoint, str(counter_example_ends)))
            continue

        if left_support:
            func_logger.info("\t\tLeft support %s" % (str(left_support)))
        if right_support:
            func_logger.info("\t\tRight support %s" % (str(right_support)))

        if (left_support and right_support) and min(
            [window + 1] + [abs(b[0] - breakpoint)
                            for b in breakpoints]) > window:
            both_support = [
                age_record for age_record in age_records
                if age_record.has_insertion(min_diff=50, max_diff=1000000000)
                and age_record.breakpoint_match(breakpoint, window)
            ]
            if both_support:
                func_logger.info("\t\tboth_support = %s" % (str(both_support)))
                func_logger.info("\t\tinsertion lengths = %s" % (str([
                    age_record.insertion_length()
                    for age_record in both_support
                ])))
            insertion_length = max(
                [0] +
                [age_record.insertion_length() for age_record in both_support])
            func_logger.info("\t\tInsertion length = %d" % insertion_length)
            breakpoints.append((breakpoint, insertion_length))

    func_logger.info("Gathered breakpoints as %s" % (str(breakpoints)))

    return [(start + b[0], b[1]) for b in breakpoints]
Example #22
    def segments_as_bedtool_intervals(self, segments, name='.'):
        yield from (pybedtools.Interval(s.contig,
                                        s.start,
                                        s.stop,
                                        strand=s.strand,
                                        name=name) for s in segments)
Example #23
def CR(stranded_bam, ivf):
    '''
	Extract Crick reads from strand-seq BAM. Switch Watson and Crick if hit in ivf
	'''

    CR = defaultdict(lambda: [None, None])

    for read in stranded_bam.fetch(until_eof=True):

        if read.is_proper_pair and not read.is_secondary and not read.is_supplementary:

            if ivf is None:  #no need to check for intervals match

                if (read.is_read1 and read.is_reverse) or (
                        read.is_read2 and
                        not read.is_reverse):  #read2 forward and read1 reverse

                    read.set_tag('OS', 'C', 'Z')  #used for debugging

                    if read.query_name not in CR:

                        if read.is_read1:

                            CR[read.query_name][0] = read

                        else:

                            CR[read.query_name][1] = read

                    else:

                        if read.is_read1:

                            yield read, CR[read.query_name][1]

                        else:

                            yield CR[read.query_name][0], read

                        del CR[read.query_name]

            else:  #there is a region to perform W-C switch in

                query = pybedtools.Interval(read.reference_name,
                                            read.reference_start,
                                            read.reference_end)

                if ivf.any_hits(query) >= 1:  #yeld Watson as Crick

                    if (read.is_read1
                            and not read.is_reverse) or (read.is_read2
                                                         and read.is_reverse):

                        read.set_tag('OS', 'W', 'Z')  #used for debugging

                        if read.query_name not in CR:

                            if read.is_read1:

                                CR[read.query_name][0] = read

                            else:

                                CR[read.query_name][1] = read

                        else:

                            if read.is_read1:

                                yield read, CR[read.query_name][1]

                            else:

                                yield CR[read.query_name][0], read

                            del CR[read.query_name]

                else:  #Classic Crick Read

                    if (read.is_read1 and read.is_reverse) or (
                            read.is_read2 and not read.is_reverse
                    ):  #read2 forward and read1 reverse

                        read.set_tag('OS', 'C', 'Z')  #used for debugging

                        if read.query_name not in CR:

                            if read.is_read1:

                                CR[read.query_name][0] = read

                            else:

                                CR[read.query_name][1] = read

                        else:

                            if read.is_read1:

                                yield read, CR[read.query_name][1]

                            else:

                                yield CR[read.query_name][0], read

                            del CR[read.query_name]
Example #24
    def select_read_pair_one_overlap_TE_annot(self,
                                              TE_annot,
                                              int_size,
                                              min_mapq,
                                              db,
                                              bin_size=50000000):
        """ output bam file of read pairs where exactly one read overlaps with an annotation in supplied gff file\
            also returns a BedTools object """

        # print "selecting discordant reads that overlap with a TE in annotation " + TE_annot + " ..."

        #use pysam to open the bam file because it has better object definition for the reads
        valid_discordant_bam = pysam.Samfile(self.bam_file_name, "rb")

        #file to save the discordant read pairs where exactly one read overlaps a TE annotation
        #overlap_TE_bam_file = pysam.Samfile(self.prefix + ".one_read_overlap_TE.bam", mode="wb", referencenames=valid_discordant_bam.references, referencelengths=valid_discordant_bam.lengths)

        #use pybedtools to look up the overlap of the Interval defined by the read with the Intervals defined by the gff file
        TE_annot_intervals = pybedtools.IntervalFile(TE_annot)

        #make a list of AlignedReadPair objects for each read pair in the list that has exactly one read overlapping a TE
        read_pairs_xor_overlap_TE = []
        print '######'
        print db
        print '######'
        read_pair_database = y_serial.Main(db)
        bin_list = list()

        try:
            read1 = valid_discordant_bam.next()
            read2 = valid_discordant_bam.next()
        except StopIteration:
            print "ERROR: no reads are found in %s, exiting" % (self.bam_file_name)
            sys.exit(2)

        while 1:
            #if verbose:
            #    print read1
            #    print read2

            #check that the reads are truly a pair:
            #if not, scoot down one in the iteration

            if read1.qname != read2.qname:
                print "unmatched pair in valid discordant reads. Problem!!"
                #sys.exit(2)
                read1 = read2
                try:
                    read2 = valid_discordant_bam.next()
                except StopIteration:
                    break
                continue

            read_pair = AlignedReadPair(read1, read2)

            #see if read1 is a TE
            read1_all_mappings = get_all_mapping_pos(read1,
                                                     valid_discordant_bam)
            for (chr, start, end) in read1_all_mappings:
                map_interval = pybedtools.Interval(chr, start, end, strand='+')
                overlapping_TE_annots = TE_annot_intervals.all_hits(
                    map_interval)
                if len(overlapping_TE_annots) > 0:
                    read_pair.TE_annot_attr_list.extend([
                        gff_interval.attrs
                        for gff_interval in overlapping_TE_annots
                    ])
                    read_pair.TE_map_gff_list.extend([
                        str(gff_interval)
                        for gff_interval in overlapping_TE_annots
                    ])
                    read_pair.read1_is_TE = True
                    #print "read1 TE"
                    #if read1 is TE, then read2 is the anchor, so set the interval chr to the chr of read2
                    read_pair.interval_chr = valid_discordant_bam.getrname(
                        read2.rname)

            #see if read2 is a TE
            read2_all_mappings = get_all_mapping_pos(read2,
                                                     valid_discordant_bam)
            for (chr, start, end) in read2_all_mappings:
                map_interval = pybedtools.Interval(chr, start, end, strand='+')
                overlapping_TE_annots = TE_annot_intervals.all_hits(
                    map_interval)
                if len(overlapping_TE_annots) > 0:
                    read_pair.TE_annot_attr_list.extend([
                        gff_interval.attrs
                        for gff_interval in overlapping_TE_annots
                    ])
                    read_pair.TE_map_gff_list.extend([
                        str(gff_interval)
                        for gff_interval in overlapping_TE_annots
                    ])
                    read_pair.read2_is_TE = True
                    #print "read2 TE"
                    #if read2 is TE, then read1 is the anchor, so set the interval chr to the chr of read1
                    read_pair.interval_chr = valid_discordant_bam.getrname(
                        read1.rname)

            #only add the AlignedRead to the list if exactly one read maps to a TE location, and the anchor is not repetitive

            if read_pair.read1_is_TE and not read_pair.read2_is_TE and not is_mapped_mult_times(
                    read2):
                if min_mapq:
                    if read2.mapq >= min_mapq:

                        read_pair.calculate_outside_interval(
                            int_size, read1, read2)
                        read_pair.calc_anchor_is_softclipped(read1, read2)

                        ###### TODO: this is where the Aligned ReadPair objects are deleted

                        #read_pair.read1 = None
                        #read_pair.read2 = None

                        #print read_pair.read1

                        #############
                        #read_pairs_xor_overlap_TE.append(read_pair)
                        tabname = "[c%s_%d_%d_%s]" % (
                            read_pair.interval_chr, bin_size *
                            (int(read_pair.interval_start) / bin_size),
                            bin_size *
                            (1 + int(read_pair.interval_start) / bin_size),
                            read_pair.interval_direction)
                        #read_pair_database.insert(read_pair,'',tabname)
                        read_pairs_xor_overlap_TE.append([read_pair, tabname])
                        ##overlap_TE_bam_file.write(read_pair.read1)
                        ##overlap_TE_bam_file.write(read_pair.read2)
                else:
                    read_pair.calculate_outside_interval(
                        int_size, read1, read2)
                    read_pair.calc_anchor_is_softclipped(read1, read2)
                    #read_pairs_xor_overlap_TE.append(read_pair)
                    tabname = "[c%s_%d_%d_%s]" % (
                        read_pair.interval_chr, bin_size *
                        (int(read_pair.interval_start) / bin_size), bin_size *
                        (1 + int(read_pair.interval_start) / bin_size),
                        read_pair.interval_direction)
                    #read_pair_database.insert(read_pair,'',tabname)
                    read_pairs_xor_overlap_TE.append([read_pair, tabname])

            elif read_pair.read2_is_TE and not read_pair.read1_is_TE and not is_mapped_mult_times(
                    read1):
                if min_mapq:
                    if read1.mapq >= min_mapq:

                        read_pair.calculate_outside_interval(
                            int_size, read1, read2)
                        read_pair.calc_anchor_is_softclipped(read1, read2)

                        #read_pairs_xor_overlap_TE.append(read_pair)
                        tabname = "[c%s_%d_%d_%s]" % (
                            read_pair.interval_chr, bin_size *
                            (int(read_pair.interval_start) / bin_size),
                            bin_size *
                            (1 + int(read_pair.interval_start) / bin_size),
                            read_pair.interval_direction)

                        #read_pair_database.insert(read_pair,'',tabname)
                        read_pairs_xor_overlap_TE.append([read_pair, tabname])
                        ##overlap_TE_bam_file.write(read_pair.read1)
                        ##overlap_TE_bam_file.write(read_pair.read2)
                else:
                    read_pair.calculate_outside_interval(
                        int_size, read1, read2)
                    read_pair.calc_anchor_is_softclipped(read1, read2)

                    #read_pairs_xor_overlap_TE.append(read_pair)
                    tabname = "[c%s_%d_%d_%s]" % (
                        read_pair.interval_chr, bin_size *
                        (int(read_pair.interval_start) / bin_size), bin_size *
                        (1 + int(read_pair.interval_start) / bin_size),
                        read_pair.interval_direction)
                    #read_pair_database.insert(read_pair,'',tabname)
                    read_pairs_xor_overlap_TE.append([read_pair, tabname])
            if len(read_pairs_xor_overlap_TE) >= 100000:
                read_pair_database.ingenerator(read_pairs_xor_overlap_TE,
                                               'read_pairs')
                #try to get the unique
                tmp = list()
                tmp = [bin_ for pair, bin_ in read_pairs_xor_overlap_TE]
                bin_list.extend(list(set(tmp)))
                del read_pairs_xor_overlap_TE
                read_pairs_xor_overlap_TE = list()

            #shift to next pair
            try:
                read1 = valid_discordant_bam.next()
                read2 = valid_discordant_bam.next()
            except StopIteration:
                break
        if len(read_pairs_xor_overlap_TE) > 0:
            read_pair_database.ingenerator(read_pairs_xor_overlap_TE,
                                           'read_pairs')
            #try to get the unique
            tmp = list()
            tmp = [bin_ for pair, bin_ in read_pairs_xor_overlap_TE]
            bin_list.extend(list(set(tmp)))
            del read_pairs_xor_overlap_TE
            read_pairs_xor_overlap_TE = list()
        bin_list = list(set(bin_list))
        read_pair_database.insert(bin_list, '', 'bin_list')
        print bin_list
        print "number discordant read pairs with exactly one read overlapping a TE: %d" % len(
            read_pairs_xor_overlap_TE)
        #print "\n".join(pair.str() for pair in read_pairs_xor_overlap_TE)
        #overlap_TE_bam_file.close()

        valid_discordant_bam.close()
        return read_pairs_xor_overlap_TE
Example #25
def _local_coverage(reader,
                    features,
                    read_strand=None,
                    fragment_size=None,
                    shift_width=0,
                    bins=None,
                    use_score=False,
                    accumulate=True,
                    preserve_total=False,
                    method=None,
                    function="mean",
                    zero_inf=True,
                    zero_nan=True,
                    processes=None,
                    stranded=True,
                    verbose=False):
    """
    Returns a binned vector of coverage.

    Computes a 1D vector of coverage at the coordinates for each feature in
    `features`, extending each read by `fragment_size` bp.

    Some arguments cannot be used for bigWig files due to the structure of
    these files.  The parameters docstring below indicates whether or not an
    argument can be used with bigWig files.

    Depending on the arguments provided, this method can return a vector
    containing values from a single feature or from concatenated features.

    An example of the flexibility afforded by the latter case:

        `features` can be a 3-tuple of pybedtools.Intervals representing (TSS
        + 1kb upstream, gene, TTS + 1kb downstream) and `bins` can be [100,
        1000, 100].  This will return a vector of length 1200 containing the
        three genomic intervals binned into 100, 1000, and 100 bins
        respectively.  Note that it is up to the caller to construct the right
        axes labels in the final plot!

    Parameters
    ----------
    features : str, interval-like object, or list

        Can be a single interval or an iterable yielding intervals.

        Interval-like objects must have chrom, start, and stop attributes, and
        optionally a strand attribute.  One exception to this that if
        `features` is a single string, it can be of the form "chrom:start-stop"
        or "chrom:start-stop[strand]".

        If `features` is a single interval, then return a 1-D array for that
        interval.

        If `features` is an iterable of intervals, then return a 1-D
        array that is a concatenation of signal for these intervals.

        Available for bigWig.

    bins : None, int, list
        If `bins` is None, then each value in the returned array will
        correspond to one bp in the genome.

        If `features` is a single Interval, then `bins` is an integer or None.

        If `features` is an iterable of Intervals, `bins` is an iterable of
        integers of the same length as `features`.

        Available for bigWig.

    fragment_size : None or int
        If not None, then each item from the genomic signal (e.g., reads from
        a BAM file) will be extended `fragment_size` bp in the 3' direction.
        Higher fragment sizes will result in smoother signal.  Not available
        for bigWig.

    shift_width : int
        Each item from the genomic signal (e.g., reads from a BAM
        file) will be shifted `shift_width` bp in the 3' direction.  This can
        be useful for reconstructing a ChIP-seq profile, using the shift width
        determined from the peak-caller (e.g., modeled `d` in MACS). Not
        available for bigWig.

    read_strand : None or str
        If `read_strand` is one of "+" or "-", then only items from the genomic
        signal (e.g., reads from a BAM file) on that strand will be considered
        and reads on the opposite strand ignored.  Useful for plotting genomic
        signal for stranded libraries. Not available for bigWig.

    stranded : bool
        If True, then the profile will be reversed for features whose strand
        attribute is "-".

    use_score : bool
        If True, then each bin will contain the sum of the *score* attribute of
        genomic features in that bin instead of the *number* of genomic
        features falling within each bin. Not available for bigWig.

    accumulate : bool
        If False, then only record *that* there was something there, rather
        than accumulating reads.  This is useful for making matrices with called
        peaks. Available for bigWig.

    preserve_total : bool
        If True, re-scales the returned value so that each binned row's total
        is equal to the sum of the original, un-binned data.  The units of the
        returned array will be in "total per bin".  This is useful for, e.g.,
        counting reads in features.  If `preserve_total` is False, then the
        returned array will have units of "density"; this is more generally
        useful and is the default behavior.  Available for bigWig, but not when
        using method="ucsc_summarize".

    method : str; one of [ "summarize" | "get_as_array" | "ucsc_summarize" ]
        Only used for bigWig.  The method specifies how data are extracted from
        the bigWig file.  "summarize" is the default.  It's quite fast, but may
        yield slightly different results when compared to running this same
        function on the BAM file from which the bigWig was created.

        "summarize" uses bx-python.  The values returned will not be exactly
        the same as the values returned when local_coverage is called on a BAM,
        BED, or bigBed file, but they will be close.  This method is quite
        fast, and is the default when bins is not None.

        "get_as_array" uses bx-python, but does a separate binning step.  This
        can be slower than the other two methods, but the results are exactly
        the same as those from a BAM, BED, or bigBed file.  This method is
        always used if bins=None.

        "ucsc_summarize" is an alternative version of "summarize".  It uses the
        UCSC program `bigWigSummary`, which must already be installed and on your
        path.

    function : str; one of ['sum' | 'mean' | 'min' | 'max' | 'std']
        Determine the nature of the values returned. Only valid if `method` is
        "summarize" or "ucsc_summarize", which also implies bigWig. Default is
        "mean". If `method="ucsc_summarize"`, then there is an additional option
        for function, "coverage", which returns the percent of region that is
        covered.

    zero_inf, zero_nan : bool
        Only used for bigWig. If either are True, sets any missing or inf
        values to zero before returning.

        If `method="ucsc_summarize"`, missing values are always reported as
        zero. If `method="get_as_array"`, missing values are always reported as
        nan.

        Values can be -inf, inf, or nan for missing values when
        `method="summarize"` according to the following table:

        ========== ========================
        `function` missing values appear as
        ========== ========================
        "sum"      0
        "mean"     nan
        "min"      inf
        "max"      -inf
        "std"      nan
        ========== ========================

    processes : int or None
        The feature can be split across multiple processes.

    Returns
    -------

    1-d NumPy array


    Notes
    -----
    If a feature has a "-" strand attribute, then the resulting profile will be
    *relative to a minus-strand feature*.  That is, the resulting profile will
    be reversed.

    Returns arrays `x` and `y`.  `x` is in genomic coordinates, and `y` is
    the coverage at each of those coordinates after extending fragments.

    The total number of reads is guaranteed to be the same no matter how it's
    binned.

    (with ideas from
    http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html)

    """
    # bigWig files are handled differently, so we need to know if we're working
    # with one; raise an exception if a kwarg was supplied that's not supported.
    if isinstance(reader, filetype_adapters.BigWigAdapter):
        is_bigwig = True
        defaults = (
            ('read_strand', read_strand, None),
            ('fragment_size', fragment_size, None),
            ('shift_width', shift_width, 0),
            ('use_score', use_score, False),
            ('preserve_total', preserve_total, False),
        )
        for name, check, default in defaults:
            if (((default is None) and (check is not default))
                    or (check != default)):
                raise ArgumentError("Argument '%s' not supported for bigWig" %
                                    name)

        if method == 'ucsc_summarize':
            if preserve_total:
                raise ArgumentError(
                    "preserve_total=True not supported when using "
                    "method='ucsc_summarize'")
    else:
        is_bigwig = False

    if isinstance(reader, filetype_adapters.BamAdapter):
        if use_score:
            raise ArgumentError("Argument 'use_score' not supported for "
                                "bam")

    # e.g., features = "chr1:1-1000"
    if isinstance(features, basestring):
        features = helpers.tointerval(features)

    if not ((isinstance(features, list) or isinstance(features, tuple))):
        if bins is not None:
            if not isinstance(bins, int):
                raise ArgumentError("bins must be an int, got %s" % type(bins))
        features = [features]
        bins = [bins]
    else:
        if bins is None:
            bins = [None for i in features]
        if not len(bins) == len(features):
            raise ArgumentError("bins must have same length as feature list")

    # nomenclature:
    #   "window" is region we're getting data for
    #   "alignment" is one item in that region
    #
    profiles = []
    xs = []
    for window, nbin in zip(features, bins):
        window = helpers.tointerval(window)
        chrom = window.chrom
        start = window.start
        stop = window.stop
        strand = window.strand

        if not is_bigwig:
            # Extend the window to catch reads that would extend into the
            # requested window
            _fs = fragment_size or 0
            padded_window = pybedtools.Interval(
                chrom,
                max(start - _fs - shift_width, 0),
                stop + _fs + shift_width,
            )
            window_size = stop - start

            # start off with an array of zeros to represent the window
            profile = np.zeros(window_size, dtype=float)

            for interval in reader[padded_window]:

                if read_strand:
                    if interval.strand != read_strand:
                        continue

                # Shift interval by modeled distance, if specified.
                if shift_width:
                    if interval.strand == '-':
                        interval.start -= shift_width
                        interval.stop -= shift_width
                    else:
                        interval.start += shift_width
                        interval.stop += shift_width

                # Extend fragment size from 3'
                if fragment_size:
                    if interval.strand == '-':
                        interval.start = interval.stop - fragment_size
                    else:
                        interval.stop = interval.start + fragment_size

                # Convert to 0-based coords that can be used as indices into
                # array
                start_ind = interval.start - start

                # If the feature goes out of the window, then only include the
                # part that's inside the window
                start_ind = max(start_ind, 0)

                # Same thing for stop
                stop_ind = interval.stop - start
                stop_ind = min(stop_ind, window_size)

                # Skip if the feature is shifted outside the window. This can
                # happen with large values of `shift_width`.
                if start_ind >= window_size or stop_ind < 0:
                    continue

                # Finally, increment profile
                if use_score:
                    score = float(interval.score)
                else:
                    score = 1

                if accumulate:
                    if preserve_total:
                        profile[start_ind:stop_ind] += (score / float(
                            (stop_ind - start_ind)))
                    else:
                        profile[start_ind:stop_ind] += score

                else:
                    profile[start_ind:stop_ind] = score

        else:  # it's a bigWig
            profile = reader.summarize(
                window,
                method=method,
                function=function,
                bins=(nbin or len(window)),
                zero_inf=zero_inf,
                zero_nan=zero_nan,
            )

        # If no bins, return genomic coords
        if nbin is None:
            x = np.arange(start, stop)

        # Otherwise do the downsampling; the resulting x is still in genomic
        # coords
        else:
            if preserve_total:
                total = float(profile.sum())
            if not is_bigwig or method == 'get_as_array':
                xi, profile = rebin(x=np.arange(start, stop),
                                    y=profile,
                                    nbin=nbin)
                if not accumulate:
                    nonzero = profile != 0
                    profile[nonzero] = 1
                x = xi

            else:
                x = np.linspace(start, stop - 1, nbin)

        # Minus-strand profiles should be flipped left-to-right.
        if stranded and strand == '-':
            profile = profile[::-1]
        xs.append(x)
        if preserve_total:
            scale = profile.sum() / total
            profile /= scale
        profiles.append(profile)

    stacked_xs = np.hstack(xs)
    stacked_profiles = np.hstack(profiles)
    del xs
    del profiles
    return stacked_xs, stacked_profiles
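
A minimal stand-alone sketch of the windowing arithmetic used above (the function name, coordinates, and read tuples are made up): reads are optionally extended to a fragment size, clamped to the window, and accumulated into a per-base array.

import numpy as np

def toy_profile(window_start, window_stop, reads, fragment_size=0):
    """Accumulate (start, stop, strand) read tuples into a per-base coverage array."""
    size = window_stop - window_start
    profile = np.zeros(size, dtype=float)
    for start, stop, strand in reads:
        # extend each read to the assumed fragment size from its 3' end
        if fragment_size:
            if strand == '-':
                start = stop - fragment_size
            else:
                stop = start + fragment_size
        # clamp to the window and convert genomic coordinates to array indices
        lo = max(start - window_start, 0)
        hi = min(stop - window_start, size)
        if lo >= size or hi <= 0:
            continue
        profile[lo:hi] += 1
    return profile

# three reads over a 20-bp window; the first and last are clipped at the edges
print(toy_profile(100, 120, [(95, 105, '+'), (110, 118, '+'), (118, 130, '-')]))
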
def scan_alignments(work,
                    scan_alignments_binary,
                    input_bam,
                    regions_bed_file,
                    reference,
                    num_threads,
                    window_size,
                    maf,
                    min_mapq,
                    max_dp,
                    restart=True,
                    split_region_files=[],
                    calc_qual=True):

    logger = logging.getLogger(scan_alignments.__name__)

    logger.info("-------------------Scan Alignment BAM----------------------")

    if not split_region_files:
        if regions_bed_file:
            regions_bed = pybedtools.BedTool(regions_bed_file).sort().merge(
                d=0)
        else:
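            # no regions BED supplied: build one interval per contig from the BAM header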
            intervals = []
            with pysam.AlignmentFile(input_bam, "rb") as samfile:
                for chrom, length in zip(samfile.references, samfile.lengths):
                    intervals.append(pybedtools.Interval(chrom, 1, length - 1))
            regions_bed = pybedtools.BedTool(intervals)
        if not os.path.exists(work):
            os.mkdir(work)
        total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed))

        if not restart:
            split_region_files = glob.glob(os.path.join(work, "region_*.bed"))
            split_total_len = sum(
                map(lambda x: sum([y.length for y in pybedtools.BedTool(x)]),
                    split_region_files))
            if split_total_len >= 0.98 * total_len:
                split_region_files = sorted(
                    split_region_files,
                    key=lambda x: int(
                        os.path.basename(x).split(".bed")[0].split("_")[1]))
        if not split_region_files:
            regions_bed_file = os.path.join(work, "all_regions.bed")
            regions_bed.saveas(regions_bed_file)

            num_split = max(
                int(
                    np.ceil(
                        (total_len / 10000000) / num_threads) * num_threads),
                num_threads)
            split_region_files = split_region(work,
                                              regions_bed_file,
                                              num_split,
                                              min_region=window_size,
                                              max_region=1e20)
    else:
        logger.info(
            "split_regions to be used (will ignore region_bed): {}".format(
                " ".join(split_region_files)))

    map_args = []
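    # one output slot per split region; slots are reassigned below, never mutated in place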
    all_outputs = [[]] * len(split_region_files)
    not_done = []
    for i, split_region_file in enumerate(split_region_files):
        if restart or not os.path.exists(os.path.join(work, "work.{}".format(i), "region.bed")) \
                or not os.path.exists(os.path.join(work, "work.{}".format(i), "candidates.vcf")) \
                or not os.path.exists(os.path.join(work, "work.{}".format(i), "count.bed.gz")):
            work_ = os.path.join(work, "work.{}".format(i))
            if os.path.exists(work_):
                shutil.rmtree(work_)
            map_args.append(
                (os.path.join(work, "work.{}".format(i)), reference,
                 scan_alignments_binary, split_region_file, input_bam,
                 window_size, maf, min_mapq, max_dp, calc_qual, 1))
            not_done.append(i)
        else:
            all_outputs[i] = [
                os.path.join(work, "work.{}".format(i), "candidates.vcf"),
                os.path.join(work, "work.{}".format(i), "count.bed.gz"),
                os.path.join(work, "work.{}".format(i), "region.bed")
            ]

    pool = multiprocessing.Pool(num_threads)
    try:
        outputs = pool.map_async(run_scan_alignments, map_args).get()
        pool.close()
    except Exception as inst:
        pool.close()
        logger.error(inst)
        traceback.print_exc()
        raise Exception

    for o in outputs:
        if o is None:
            raise Exception("scan_alignments failed!")

    for i, output in zip(not_done, outputs):
        all_outputs[i] = output
    return all_outputs
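
A stand-alone sketch of the whole-genome region construction used above when no regions BED is supplied; "sample.bam" and "all_regions.bed" are placeholder paths, and any coordinate-sorted, indexed BAM would do.

import pysam
import pybedtools

intervals = []
with pysam.AlignmentFile("sample.bam", "rb") as samfile:
    # one interval per contig, taken from the BAM header
    for chrom, length in zip(samfile.references, samfile.lengths):
        intervals.append(pybedtools.Interval(chrom, 1, length - 1))
regions_bed = pybedtools.BedTool(intervals).sort().merge(d=0)
regions_bed.saveas("all_regions.bed")
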
Example #27
0
def find_resolved_variants((chrom, start, end, variants, input_bam, reference)):
    thread_logger = logging.getLogger(
        "{} ({})".format(find_resolved_variants.__name__, multiprocessing.current_process().name))
    try:
        ref = pysam.FastaFile(reference)
        out_variants = []
        start, end = map(int, [start, end])
        region = [chrom, start, end]
        vartypes = map(lambda x: x[-1], variants)
        scores = map(lambda x: x[5], variants)
        if len(set(vartypes)) > 1:
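            # mixed variant types in this region: emit the original calls unchanged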
            out_variants.extend(
                map(lambda x: [x[0], int(x[1]), x[3], x[4], x[10], x[5]], variants))
        else:
            vartype = vartypes[0]
            score = max(scores)
            if vartype == "DEL":
                intervals = []
                dels = []
                with pysam.AlignmentFile(input_bam) as samfile:
                    for record in samfile.fetch(chrom, start, end):
                        if record.cigarstring and "D" in record.cigarstring:
                            dels.extend(extract_del(record))
                dels = filter(lambda x: (
                    start <= x[1] <= end) or start <= x[2] <= end, dels)
                if dels:
                    intervals = map(lambda x: pybedtools.Interval(
                        x[0], x[1], x[2]), dels)
                    bed = pybedtools.BedTool(intervals)
                    del_strs = map(lambda x: "---".join(x[0:3]), bed)
                    uniq_dels = list(set(del_strs))
                    uniq_dels_count = {}
                    for del_ in uniq_dels:
                        uniq_dels_count[del_] = del_strs.count(del_)
                    max_count = max(uniq_dels_count.values())
                    for del_ in uniq_dels:
                        if uniq_dels_count[del_] <= max_count * 0.5:
                            del uniq_dels_count[del_]
                    new_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2])),
                                                     map(lambda x: x.split("---"), uniq_dels_count.keys())))
                    new_bed = new_bed.sort().merge(c=[1], o="count")
                    out_variants.extend(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int(
                        x[1]) - 1, int(x[2])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])), "0/1", score], new_bed))
            elif vartype == "INS":
                intervals = []
                inss = []
                with pysam.AlignmentFile(input_bam) as samfile:
                    for record in samfile.fetch(chrom, start, end):
                        if record.cigarstring and "I" in record.cigarstring:
                            inss.extend(extract_ins(record))
                inss = filter(lambda x: (
                    start <= x[1] <= end) or start <= x[2] <= end, inss)
                if inss:
                    intervals = map(lambda x: pybedtools.Interval(
                        x[0], x[1], x[2], x[3]), inss)
                    bed = pybedtools.BedTool(intervals)
                    ins_strs = map(lambda x: "---".join(x[0:4]), bed)
                    uniq_inss = list(set(ins_strs))
                    uniq_inss_count = {}
                    for ins_ in uniq_inss:
                        uniq_inss_count[ins_] = ins_strs.count(ins_)
                    max_ins, max_count = sorted(
                        uniq_inss_count.items(), key=lambda x: x[1])[-1]
                    max_pos = int(max_ins.split("---")[1])
                    for ins_ in uniq_inss:
                        if uniq_inss_count[ins_] <= max_count * 0.5 or 0 < abs(int(ins_.split("---")[1]) - max_pos) < 4:
                            del uniq_inss_count[ins_]
                    new_bed = pybedtools.BedTool(map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2]), x[3]),
                                                     map(lambda x: x.split("---"), uniq_inss_count.keys()))).sort()
                    out_variants.extend(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int(
                        x[1]) - 1, int(x[1])), ref.fetch(x[0], int(x[1]) - 1, int(x[1])) + x[3], "0/1", score], new_bed))
        return out_variants
    except Exception as ex:
        thread_logger.error(traceback.format_exc())
        thread_logger.error(ex)
        return None
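
A toy sketch of the consensus step used above for DEL/INS evidence: keep only candidate events whose read support exceeds half that of the most frequent identical event. The coordinates are made up.

from collections import Counter

dels = [("chr1", 100, 110), ("chr1", 100, 110), ("chr1", 100, 110),
        ("chr1", 102, 111), ("chr1", 200, 230)]
counts = Counter(dels)
max_count = max(counts.values())
consensus = [d for d, c in counts.items() if c > max_count * 0.5]
print(consensus)  # [('chr1', 100, 110)]
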
Example #28
0
def resolve_for_IDP_ITX_CTX(vcf_records, fasta_file, pad=0, wiggle=10, overlap_ratio=0.9):
    del_records = filter(lambda x: x.INFO["SVTYPE"] == "DEL", vcf_records)
    dup_records = filter(lambda x: x.INFO["SVTYPE"] == "DUP", vcf_records)
    ins_records = filter(lambda x: x.INFO["SVTYPE"] == "INS", vcf_records)
    other_records = filter(lambda x: x.INFO["SVTYPE"] not in ["DEL", "DUP", "INS"], vcf_records)
    del_bedtool = pybedtools.BedTool([pybedtools.Interval(x.CHROM, x.POS, (x.POS+abs(x.INFO["SVLEN"])),
                                      name="DEL_%d"%i,score=x.FILTER[0]) for i,x in enumerate(del_records)])     
    dup_bedtool = pybedtools.BedTool([pybedtools.Interval(x.CHROM, x.POS, (x.POS+abs(x.INFO["SVLEN"])),
                                      name="DUP_%d"%i,score=x.FILTER[0]) for i,x in enumerate(dup_records)])     
    ins_bedtool = pybedtools.BedTool([pybedtools.Interval(x.CHROM, x.POS, (x.POS+1),
                                      name="INS_%d"%i,score=x.FILTER[0],otherfields=[x.INFO["SC_CHR2_STR"] if 
                                      "SC_CHR2_STR" in x.INFO else "."]) 
                                      for i,x in enumerate(ins_records)])     
    chr2_intervals = []
    for interval in ins_bedtool:
        chr2_intervals.extend(build_chr2_ins(interval))
    
    chr2_ins_bedtool = pybedtools.BedTool(chr2_intervals).sort()
    
    idp_bedtool = dup_bedtool.window(del_bedtool, w=wiggle).each(
        partial(find_idp, wiggle=wiggle)).sort()
    remained_dup_bedtool = dup_bedtool.intersect(
        idp_bedtool, f=0.95, r=True, wa=True, v=True).sort()
    remained_del_bedtool = del_bedtool.intersect(
        idp_bedtool.each(partial(extract_del_interval)).sort(),
        f=0.95, r=True, wa=True, v=True)
    itx_bedtool = idp_bedtool.window(idp_bedtool, w=wiggle).each(
        partial(find_itx, wiggle=wiggle)).sort()
    remained_idp_bedtool_1 = idp_bedtool.window(itx_bedtool, w=wiggle).each(
        partial(filter_itxs)).sort()
    remained_idp_bedtool_2 = idp_bedtool.window(itx_bedtool, w=wiggle, c=True).filter(
        lambda x: x.fields[-1] == "0").sort()


    ctx_bedtool = remained_del_bedtool.intersect(
        chr2_ins_bedtool, r=True, f=overlap_ratio, wa=True, wb=True).each(
        partial(find_ctx, overlap_ratio=overlap_ratio)).sort()
    remained_del_bedtool = remained_del_bedtool.intersect(
        ctx_bedtool, f=0.95, r=True, wa=True, v=True).sort()

    if len(remained_idp_bedtool_2) > 0:
        remained_idp_bedtool_2 = remained_idp_bedtool_2.cut(
            range(idp_bedtool.field_count())).sort()

    recoverd_pass_del_dup_ins = []
    removed_pass_del_dup_ins = []
    for bed in (remained_idp_bedtool_1, remained_idp_bedtool_2, itx_bedtool, ctx_bedtool):
        recoverd_pass_del_dup_ins.append(",".join(
            map(lambda y: y.name, filter(lambda x: "LowQual" in x.score, bed))))
        removed_pass_del_dup_ins.append(",".join(
            map(lambda y: y.name, filter(lambda x: "LowQual" not in x.score, bed))))

    recoverd_pass_del_dup_ins = set(",".join(recoverd_pass_del_dup_ins).split(",")) - set([''])
    removed_pass_del_dup_ins = set(",".join(removed_pass_del_dup_ins).split(",")) - set([''])
    recoverd_pass_del_dup_ins = recoverd_pass_del_dup_ins - removed_pass_del_dup_ins
    recoverd_dups = list(set([x.name for x in remained_dup_bedtool]) |
                         set(filter(lambda x: "DUP" in x, recoverd_pass_del_dup_ins)))
    recoverd_dels = list(set([x.name for x in remained_del_bedtool]) |
                         set(filter(lambda x: "DEL" in x, recoverd_pass_del_dup_ins)))
    recoverd_inss = list(set([x.name for x in ins_bedtool]) -
                         set(filter(lambda x: "INS" in x, removed_pass_del_dup_ins)))
    vcf_records = other_records + [dup_records[int(x.split("_")[-1])] for x in recoverd_dups] + \
                                  [del_records[int(x.split("_")[-1])] for x in recoverd_dels] + \
                                  [ins_records[int(x.split("_")[-1])] for x in recoverd_inss] + \
                                  [merge_idp_itx(fasta_file,dup_records[int(x.name.split(",")[0].split("_")[-1])],
                                             [del_records[int(x.name.split(",")[1].split("_")[-1])]],
                                             int(x.fields[6]),x.fields[7],x.score,"IDP") for x in remained_idp_bedtool_1] + \
                                  [merge_idp_itx(fasta_file,dup_records[int(x.name.split(",")[0].split("_")[-1])],
                                             [del_records[int(x.name.split(",")[1].split("_")[-1])]],
                                             int(x.fields[6]),x.fields[7],x.score,"IDP") for x in remained_idp_bedtool_2] + \
                                  [merge_idp_itx(fasta_file,dup_records[int(x.name.split(",")[0].split("_")[-1])],
                                             [del_records[int(x.name.split(",")[1].split("_")[-1])],
                                             del_records[int(x.name.split(",")[2].split("_")[-1])]],
                                             int(x.fields[6]),x.fields[7],x.score,"ITX") for x in itx_bedtool] + \
                                  [merge_ctx(fasta_file,del_records[int(x.name.split(",")[0].split("_")[-1])],
                                             ins_records[int(x.name.split(",")[1].split("_")[-1])],
                                             x.score) for x in ctx_bedtool] 
    vcf_records = sorted(map(lambda x: remove_info_fields(x,["SC_CHR2_STR"]),vcf_records), key = lambda x: (x.CHROM, x.POS))
    return vcf_records
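
A minimal sketch of the window-then-filter pattern used above (BedTool.window followed by .each(partial(...))); keep_if_close is a simplified stand-in for find_idp, and the intervals and the wiggle value are made up.

from functools import partial
import pybedtools

def keep_if_close(feature, wiggle):
    # After .window(), the first half of the fields comes from the DUP interval
    # and the second half from the DEL interval; drop pairs whose starts differ
    # by more than `wiggle` (returning None makes .each() skip the feature).
    n = len(feature.fields) // 2
    if abs(feature.start - int(feature.fields[n + 1])) > wiggle:
        return None
    return feature

dups = pybedtools.BedTool([pybedtools.Interval("chr1", 100, 500, name="DUP_0")])
dels = pybedtools.BedTool([pybedtools.Interval("chr1", 105, 500, name="DEL_0")])
paired = dups.window(dels, w=10).each(partial(keep_if_close, wiggle=10))
for feature in paired:
    print(feature)
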
Example #29
0
def filter_candidates(
    (candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp,
     good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af,
     del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)):

    logger.info("-----------------------------------------------------------")
    logger.info("Filter Candidates")
    logger.info("-----------------------------------------------------------")

    records = {}
    with open(candidates_vcf) as v_f:
        for line in v_f:
            if line[0] == "#":
                continue
            if len(line.strip().split()) != 10:
                raise RuntimeError(
                    "Bad VCF line (<10 fields): {}".format(line))
            chrom, pos, _, ref, alt, _, _, info_, _, info = line.strip().split()
            pos = int(pos)
            loc = "{}.{}".format(chrom, pos)
            dp, ro, ao = map(int, info.split(":")[1:4])
            info_dict = dict(
                map(lambda x: x.split("="),
                    filter(lambda x: x, info_.split(";"))))
            mq_ = safe_read_info_dict(info_dict, "MQ", int, -100)
            bq_ = safe_read_info_dict(info_dict, "BQ", int, -100)
            nm_ = safe_read_info_dict(info_dict, "NM", int, -100)
            as_ = safe_read_info_dict(info_dict, "AS", int, -100)
            xs_ = safe_read_info_dict(info_dict, "XS", int, -100)
            pr_ = safe_read_info_dict(info_dict, "PR", int, -100)
            cl_ = safe_read_info_dict(info_dict, "CL", int, -100)
            st_ = safe_read_info_dict(info_dict, "ST", str, "-100,-100")
            ls_ = safe_read_info_dict(info_dict, "LS", int, -100)
            rs_ = safe_read_info_dict(info_dict, "RS", int, -100)

            if ao < min(ro, min_ao):
                continue

            if loc not in records:
                records[loc] = []
            if ref == "N" or "\t".join(line.split()[0:5]) \
                    not in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]):
                records[loc].append([
                    chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_,
                    nm_, as_, xs_, pr_, cl_, line
                ])
            elif "\t".join(line.split()[0:5]) \
                    in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]):
                for i, x in enumerate(records[loc]):
                    if "\t".join(line.split()[0:5]) == "\t".join(x[-1].split()[0:5]) \
                            and ao / float(ro + 0.0001) > x[6] / float(x[5] + 0.0001):
                        records[loc][i] = [
                            chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_,
                            ls_, rs_, nm_, as_, xs_, pr_, cl_, line
                        ]
                        break
    fasta_file = pysam.Fastafile(reference)
    good_records = []
    dels = []
    for loc, rs in sorted(records.iteritems(), key=lambda x: x[1][0:2]) + \
            [["", [["", 0, "", "", 0, 0, 0, ""]]]]:
        ins = filter(lambda x: x[2] == "N", rs)
        if len(ins) > 1:
            # emit ins
            afs = map(lambda x: x[6] / float(x[5] + x[6]), ins)
            max_af = max(afs)
            ins = filter(
                lambda x: x[6] / float(x[5] + x[6]) >= (max_af * merge_r), ins)
            chrom, pos, ref = ins[0][0:3]
            dp = max(map(lambda x: x[4], ins))
            ro = max(map(lambda x: x[5], ins))
            ao = max(map(lambda x: x[6], ins))
            mq_ = max(map(lambda x: x[7], ins))
            bq_ = max(map(lambda x: x[8], ins))
            st_ = "{},{}".format(
                max(map(lambda x: int(x[9].split(",")[0]), ins)),
                max(map(lambda x: int(x[9].split(",")[1]), ins)))
            ls_ = max(map(lambda x: x[10], ins))
            rs_ = max(map(lambda x: x[11], ins))
            nm_ = max(map(lambda x: x[12], ins))
            as_ = max(map(lambda x: x[13], ins))
            xs_ = max(map(lambda x: x[14], ins))
            pr_ = max(map(lambda x: x[15], ins))
            cl_ = max(map(lambda x: x[16], ins))
            alt = "".join(map(lambda x: x[3], ins))
            if (max_af >= ins_merge_min_af) or (ao >= good_ao):
                ins = [[
                    chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_,
                    nm_, as_, xs_, pr_, cl_
                ]]
            else:
                ins = []
        elif len(ins) == 1:
            # emit 1-base ins
            dp, ro, ao = ins[0][4:7]
            if (ao / float(ro + ao) < ins_min_af and ao < good_ao) or dp <= 5:
                ins = []
            else:
                ins = [ins[0][:-1]]
        good_records.extend(ins)
        if dels and (ins or filter(lambda x: x[3] != "N" and x[2] != "N", rs)):
            # emit del
            if len(dels) == 1:
                ro = dels[0][5]
                ao = dels[0][6]
                chrom, pos, ref = dels[0][0:3]
                if ao / float(ro + ao) >= del_min_af or ao >= good_ao:
                    good_records.extend(dels)

            else:
                afs = map(lambda x: x[6] / float(x[5] + x[6]), dels)
                max_af = max(afs)
                merge_r_thr = merge_r * max_af
                dels = filter(
                    lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels)
                chrom, pos = dels[0][0:2]
                dp = max(map(lambda x: x[4], dels))
                ro = max(map(lambda x: x[5], dels))
                ao = max(map(lambda x: x[6], dels))
                mq_ = max(map(lambda x: x[7], dels))
                bq_ = max(map(lambda x: x[8], dels))
                st_ = "{},{}".format(
                    max(map(lambda x: int(x[9].split(",")[0]), dels)),
                    max(map(lambda x: int(x[9].split(",")[1]), dels)))
                ls_ = max(map(lambda x: x[10], dels))
                rs_ = max(map(lambda x: x[11], dels))
                nm_ = max(map(lambda x: x[12], dels))
                as_ = max(map(lambda x: x[13], dels))
                xs_ = max(map(lambda x: x[14], dels))
                pr_ = max(map(lambda x: x[15], dels))
                cl_ = max(map(lambda x: x[16], dels))
                ref = "".join(map(lambda x: x[2], dels))
                alt = "N"
                good_records.append([
                    chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_,
                    nm_, as_, xs_, pr_, cl_
                ])
            dels = []
        if not loc:
            continue

        for record in rs:
            dp = record[4]
            if dp <= min_dp:
                continue
            ro, ao = record[5:7]
            if record[2] != "N" and record[3] != "N" and record[2] != record[3]:
                bq = record[8]
                if (ao / float(ro + ao) >= snp_min_af or ao >= snp_min_ao) and bq >= snp_min_bq:
                    # emit SNP
                    good_records.append(record[:-1])
            elif record[2] != "N" and record[3] == "N":
                if ao / float(ro + ao) >= del_merge_min_af or ao >= good_ao:
                    chrom, pos = record[0:2]
                    if dels and pos - dels[-1][1] != 1:
                        # emit del
                        if len(dels) == 1:
                            ro = dels[0][5]
                            ao = dels[0][6]
                            chrom, pos, ref = dels[0][0:3]
                            pos = int(pos)
                            if ao / float(ro + ao) >= del_min_af:
                                good_records.extend(dels)
                        else:
                            afs = map(lambda x: x[6] / float(x[5] + x[6]),
                                      dels)
                            max_af = max(afs)
                            merge_r_thr = merge_r * max_af
                            dels = filter(
                                lambda x: x[6] / float(x[5] + x[6]) >=
                                merge_r_thr, dels)
                            chrom, pos = dels[0][0:2]
                            dp = max(map(lambda x: x[4], dels))
                            ro = max(map(lambda x: x[5], dels))
                            ao = max(map(lambda x: x[6], dels))
                            mq_ = max(map(lambda x: x[7], dels))
                            bq_ = max(map(lambda x: x[8], dels))
                            st_ = "{},{}".format(
                                max(
                                    map(lambda x: int(x[9].split(",")[0]),
                                        dels)),
                                max(
                                    map(lambda x: int(x[9].split(",")[1]),
                                        dels)))
                            ls_ = max(map(lambda x: x[10], dels))
                            rs_ = max(map(lambda x: x[11], dels))
                            nm_ = max(map(lambda x: x[12], dels))
                            as_ = max(map(lambda x: x[13], dels))
                            xs_ = max(map(lambda x: x[14], dels))
                            pr_ = max(map(lambda x: x[15], dels))
                            cl_ = max(map(lambda x: x[16], dels))
                            ref = "".join(map(lambda x: x[2], dels))
                            alt = "N"
                            good_records.append([
                                chrom, pos, ref, alt, dp, ro, ao, mq_, bq_,
                                st_, ls_, rs_, nm_, as_, xs_, pr_, cl_
                            ])
                        dels = []
                    # accumulate dels
                    dels.append(record[:-1])

    final_records = []
    dels = []
    for i, record in enumerate(good_records):
        chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ = record
        ref = ref.upper()
        alt = alt.upper()
        info_str = ""
        if st_ != "-100,-100":
            info_str += ";ST={}".format(st_)
        if ls_ != -100:
            info_str += ";LS={}".format(ls_)
        if rs_ != -100:
            info_str += ";RS={}".format(rs_)
        if nm_ != -100:
            info_str += ";NM={}".format(nm_)
        if as_ != -100:
            info_str += ";AS={}".format(as_)
        if xs_ != -100:
            info_str += ";XS={}".format(xs_)
        if pr_ != -100:
            info_str += ";PR={}".format(pr_)
        if cl_ != -100:
            info_str += ";CL={}".format(cl_)
        if mq_ != -100:
            info_str += ";MQ={}".format(mq_)
        if bq_ != -100:
            info_str += ";BQ={}".format(bq_)

        af = np.round(ao / float(ao + ro), 4)
        info_str += ";AF={}".format(af)
        if ref != "N" and alt != "N":
            line = "\t".join([
                chrom,
                str(pos), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
            ])
            final_records.append([chrom, pos, ref, alt, line])
        elif alt == "N":
            ref = fasta_file.fetch(chrom, pos - 2, pos + len(ref) - 1).upper()
            alt = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
            line = "\t".join([
                chrom,
                str(pos - 1), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
            ])
            final_records.append([chrom, pos - 1, ref, alt, line])
        elif ref == "N":
            ref = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
            alt = ref + alt
            line = "\t".join([
                chrom,
                str(pos - 1), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)
            ])
            final_records.append([chrom, pos - 1, ref, alt, line])
    final_records = sorted(final_records, key=lambda x: x[0:2])
    if dbsnp:
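        # drop candidates whose position and alleles exactly match a dbSNP entry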
        filtered_bed = pybedtools.BedTool(
            map(
                lambda x: pybedtools.Interval(x[1][0], int(x[1][1]),
                                              int(x[1][1]) + 1, x[1][2], x[1][
                                                  3], str(x[0])),
                enumerate(final_records))).sort()
        dbsnp = pybedtools.BedTool(dbsnp).each(
            lambda x: pybedtools.Interval(x[0], int(x[1]),
                                          int(x[1]) + 1, x[3], x[4])).sort()
        non_in_dbsnp_1 = filtered_bed.window(dbsnp, w=0, v=True)
        non_in_dbsnp_2 = filtered_bed.window(dbsnp, w=0).filter(
            lambda x: x[1] != x[7] or x[3] != x[9] or x[4] != x[10]).sort()
        non_in_dbsnp_ids = []
        for x in non_in_dbsnp_1:
            non_in_dbsnp_ids.append(int(x[5]))
        for x in non_in_dbsnp_2:
            non_in_dbsnp_ids.append(int(x[5]))
        final_records = map(
            lambda x: x[1],
            filter(lambda x: x[0] in non_in_dbsnp_ids,
                   enumerate(final_records)))
    with open(filtered_candidates_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in final_records:
            o_f.write(record[-1] + "\n")
    return filtered_candidates_vcf
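
A toy sketch of the dbSNP exclusion step used above: candidate SNVs stored as 1-bp intervals (name=REF, score=ALT) are compared against a made-up dbSNP interval set, and window(..., w=0, v=True) keeps only the candidates with no dbSNP record at the same position.

import pybedtools

candidates = pybedtools.BedTool([
    pybedtools.Interval("chr1", 100, 101, name="A", score="T"),
    pybedtools.Interval("chr1", 200, 201, name="C", score="G"),
])
dbsnp = pybedtools.BedTool([
    pybedtools.Interval("chr1", 100, 101, name="A", score="T"),
])

for feature in candidates.window(dbsnp, w=0, v=True):
    print(feature)  # only the chr1:200 C>G candidate remains
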
Example #30
0
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, workdir=None, vcf_template_file=vcf_template, sample=None, reference=None,
                              pass_calls=True):
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    intervals = []
    if bedfile:
    
        for interval in pybedtools.BedTool(bedfile):
            interval_info = get_interval_info(interval, pass_calls)
            if interval_info:
                name = "%s,%s,%d,%s" % (
                    base64.b64encode(json.dumps(interval_info["info"])),
                    interval_info["sv_type"], interval_info["sv_length"],
                    ";".join(interval_info["svmethods"]))
                updated_interval = pybedtools.Interval(
                    interval.chrom, interval_info["pos"], interval_info["end"],
                    name=name, score=interval.score,
                    otherfields=[interval_info["genotype"],
                                 interval_info["sv_filter"]])
                if not intervals:
                    intervals.append(updated_interval)
                else:
                    merged_interval=check_duplicates(updated_interval,intervals[-1])
                    if merged_interval:
                        func_logger.info("Merging intervals: %s and %s" % (updated_interval,intervals[-1]))
                        intervals.pop()
                        intervals.append(merged_interval)
                    else:
                        intervals.append(updated_interval)
            else: 
                func_logger.info("Skip interval: %s" % (interval))

    nonfilterd_bed = os.path.join(workdir, "final_nonfilterd.bed")
    filterd_bed = os.path.join(workdir, "final_filterd.bed")
    bedtool = pybedtools.BedTool(intervals).sort().moveto(nonfilterd_bed)
    filterd_bed = filter_confused_INS_calls(nonfilterd_bed, filterd_bed)

    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))
    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    fasta_file = None
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict([(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference
        fasta_file = pysam.Fastafile(reference)

    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]
    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)
    vcf_records = []
    if filterd_bed:
        bedtool = pybedtools.BedTool(filterd_bed)
        for interval in bedtool:
            name_split=interval.name.split(",")
            info = json.loads(base64.b64decode(name_split[0]))
            sv_type = name_split[1]
            sv_id = "."
            ref = fasta_file.fetch(str(interval.chrom), interval.start, interval.start + 1) if fasta_file else "."
            alt = [vcf.model._SV(sv_type)]
            qual = "."
            sv_filter = [interval.fields[7]]
            genotype = interval.fields[6]
            sv_format = "GT"
            sample_indexes = [0]
            vcf_record = vcf.model._Record(interval.chrom, interval.start, sv_id, ref, alt, qual,
                                           sv_filter, info, sv_format, sample_indexes)
            vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
            vcf_records.append(vcf_record)
            
    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    resolved_vcf_records = resolve_for_IDP_ITX_CTX(vcf_records,fasta_file)

    for vcf_record in resolved_vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
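
A round-trip sketch of the name-field encoding used above: the per-interval INFO dict is JSON-encoded, base64-encoded, and stored as the first comma-separated token of the BED name, then recovered when the VCF records are written. The keys and values here are made up.

import base64
import json

info = {"SVLEN": 320, "SVMETHOD": ["RP", "SR"]}  # made-up INFO fields
name = "%s,%s,%d,%s" % (base64.b64encode(json.dumps(info).encode()).decode(),
                        "DEL", 320, "RP;SR")

# later, the first token of the name is decoded back into the original dict
token = name.split(",")[0]
print(json.loads(base64.b64decode(token)))  # -> the original info dict
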