def pos_counter_by_pos(bam_fpath, positions):
    '''Yield one Counter of in-read base positions per requested column.

    The pileup of the BAM in bam_fpath is walked column by column.  For
    every column whose (reference name, reference position) tuple is found
    in positions, a Counter is yielded that maps "position of the piled-up
    base inside its read" to the number of reads showing that position.
    For reverse-strand reads the position is counted from the other end of
    the read (query_length - query_position - 1).

    positions -- container of (reference_name, reference_pos) tuples; a set
                 keeps the membership test cheap.

    Columns are yielded in the order pysam's pileup() produces them.  The
    BAM is closed once the generator is exhausted or closed.
    '''
    alignmentfile = AlignmentFile(bam_fpath)
    # reference id -> reference name, filled lazily so getrname() is only
    # called once per reference instead of once per pileup column.
    ref_name_index = {}
    try:
        for pileup_col in alignmentfile.pileup():
            ref_id = pileup_col.reference_id
            try:
                ref_name = ref_name_index[ref_id]
            except KeyError:
                ref_name = alignmentfile.getrname(ref_id)
                ref_name_index[ref_id] = ref_name

            position = (ref_name, pileup_col.reference_pos)
            if position not in positions:
                continue

            pos_counter = Counter()
            # NOTE(review): this stdout output (and the one below) looks like
            # leftover debugging; kept because callers may rely on it.
            # Written so it prints the same under Python 2 and 3.
            print(position)
            for pileup_read in pileup_col.pileups:
                alignment = pileup_read.alignment
                base_pos = pileup_read.query_position
                # pysam documents query_position as None when the read has
                # a deletion or reference skip at this column.  Such reads
                # are counted under None on both strands instead of crashing
                # on the reverse-strand arithmetic.
                if base_pos is not None and alignment.is_reverse:
                    base_pos = alignment.query_length - base_pos - 1
                print('%s %s' % (base_pos, alignment.qname))
                pos_counter[base_pos] += 1
            yield pos_counter
    finally:
        alignmentfile.close()
def downgrade_read_edges(in_fpath, out_fpath, size,
                         qual_to_substract=QUAL_TO_SUBSTRACT):
    '''Copy a SAM/BAM file to a BAM file, downgrading read edge qualities.

    Every read of in_fpath is passed to _downgrade_edge_qualities() together
    with size and qual_to_substract and is then written to out_fpath, which
    is opened in 'wb' mode using the input file as header template.

    in_fpath          -- path of the alignment file to read.
    out_fpath         -- path of the BAM file to create.
    size              -- forwarded to _downgrade_edge_qualities
                         (presumably the edge length; confirm in the helper).
    qual_to_substract -- forwarded to _downgrade_edge_qualities.

    Both files are closed explicitly, also when an error interrupts the
    copy, so the output is flushed and finalised without relying on
    garbage collection.
    '''
    in_sam = AlignmentFile(in_fpath)
    try:
        out_sam = AlignmentFile(out_fpath, 'wb', template=in_sam)
        try:
            for aligned_read in in_sam:
                # The helper's return value is ignored: the read object
                # itself is what gets written afterwards.
                _downgrade_edge_qualities(
                    aligned_read, size, qual_to_substract=qual_to_substract)
                out_sam.write(aligned_read)
        finally:
            out_sam.close()
    finally:
        in_sam.close()
def downgrade_read_edges(in_fpath, out_fpath, size,
                         qual_to_substract=QUAL_TO_SUBSTRACT):
    '''Copy a SAM/BAM file to a BAM file, downgrading read edge qualities.

    Every read of in_fpath is passed to _downgrade_edge_qualities() together
    with size and qual_to_substract and is then written to out_fpath, which
    is opened in 'wb' mode using the input file as header template.

    in_fpath          -- path of the alignment file to read.
    out_fpath         -- path of the BAM file to create.
    size              -- forwarded to _downgrade_edge_qualities
                         (presumably the edge length; confirm in the helper).
    qual_to_substract -- forwarded to _downgrade_edge_qualities.

    Raises RuntimeError as soon as a read already carries
    LEFT_DOWNGRADED_TAG or RIGTH_DOWNGRADED_TAG; the reads written before
    that point remain in out_fpath.

    Both files are closed explicitly, also on the error path, so the output
    is flushed and finalised without relying on garbage collection.
    '''
    in_sam = AlignmentFile(in_fpath)
    try:
        out_sam = AlignmentFile(out_fpath, 'wb', template=in_sam)
        try:
            for aligned_read in in_sam:
                already_downgraded = (
                    aligned_read.has_tag(LEFT_DOWNGRADED_TAG) or
                    aligned_read.has_tag(RIGTH_DOWNGRADED_TAG))
                if already_downgraded:
                    # Refuse to subtract the quality penalty a second time.
                    raise RuntimeError('Edge qualities already downgraded\n')
                _downgrade_edge_qualities(
                    aligned_read, size, qual_to_substract=qual_to_substract)
                out_sam.write(aligned_read)
        finally:
            out_sam.close()
    finally:
        in_sam.close()
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''Yield (pair, kind) for every read pair found in a BAM file.

    The alignments are grouped by query name and split into mates.  Each
    pair is labelled NON_CHIMERIC, CHIMERA or UNKNOWN depending on what the
    _mates_are_not_chimeric / _mates_are_chimeric helpers decide (according
    to the original documentation, from the distance and orientation of the
    mates on the reference).

    bam_fhand     -- file-like object; only its .name is used to open the BAM.
    mate_distance -- expected distance between mates; the accepted range is
                     mate_distance +/- settings['MATE_DISTANCE_VARIATION'].
    settings      -- mapping providing MAX_CLIPPING, MAX_PE_LEN and
                     MATE_DISTANCE_VARIATION.

    pair is the list of sequence items built from the primary alignment of
    each mate; pairs in which any of those items is None are not yielded.
    '''
    bamfile = AlignmentFile(bam_fhand.name)

    # settings. Include in function properties with default values
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    mate_length_range = [mate_distance - variation,
                         mate_distance + variation]
    reference_lengths = _get_ref_lengths(bamfile)

    for qname_group in _group_alignments_reads_by_qname(bamfile):
        mates_alignments = _split_mates(qname_group)

        # The non-chimeric test goes first; the chimeric test is only run
        # for the pairs that fail it.
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   reference_lengths):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len, reference_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        pair = []
        for mate_alignments in mates_alignments:
            primary = _get_primary_alignment(mate_alignments)
            pair.append(alignedread_to_seqitem(primary))

        if None in pair:
            continue
        yield pair, kind
def mapped_count_by_rg(bam_fpaths, mapqx=None):
    '''Count mapped and unmapped reads per read group over several BAMs.

    bam_fpaths -- iterable of BAM file paths.
    mapqx      -- optional mapping-quality threshold.  When given, every
                  counter also gets a 'bigger_mapqx' entry with the number
                  of reads whose mapq is >= mapqx (counted independently of
                  the mapped/unmapped status).

    Returns a dict: read group id -> IntCounter with the keys 'mapped',
    'unmapped' and, if mapqx was given, 'bigger_mapqx'.

    A BAM without read groups in its header is counted under its file name
    without directory and extension.  The same name is used for any read
    that carries no read group, even if the header declares some.
    '''
    do_mapqx = mapqx is not None

    def _new_counter():
        counter = IntCounter({'unmapped': 0, 'mapped': 0})
        if do_mapqx:
            counter['bigger_mapqx'] = 0
        return counter

    counter_by_rg = {}
    for bam_fpath in bam_fpaths:
        # Computed for every BAM: it is also the fallback key for untagged
        # reads, so it cannot depend on the header having no read groups.
        bam_basename = os.path.splitext(os.path.basename(bam_fpath))[0]
        bam = AlignmentFile(bam_fpath, 'rb')
        try:
            readgroups = get_bam_readgroups(bam)
            if readgroups is None:
                readgroups = [bam_basename]
            else:
                readgroups = [rg['ID'] for rg in readgroups]

            # NOTE(review): as in the original code, a read group declared
            # by several BAMs is reset when the later BAM is opened.
            # Confirm whether the counts should accumulate instead.
            for readgroup in readgroups:
                counter_by_rg[readgroup] = _new_counter()

            for read in bam:
                rg = get_rg_from_alignedread(read)
                if rg is None:
                    rg = bam_basename
                # Reads can name a group the header did not declare (or
                # fall back to the file name); create the counter on demand
                # instead of failing with a KeyError.
                if rg not in counter_by_rg:
                    counter_by_rg[rg] = _new_counter()
                counter = counter_by_rg[rg]

                if do_mapqx and read.mapq >= mapqx:
                    counter['bigger_mapqx'] += 1
                if read.is_unmapped:
                    counter['unmapped'] += 1
                else:
                    counter['mapped'] += 1
        finally:
            bam.close()
    return counter_by_rg
def calculate_distance_distribution_in_bam(bam_fhand, max_clipping,
                                           max_distance=None):
    '''Return the distribution of mate distances found in a BAM file.

    bam_fhand    -- file-like object; only its .name is used to open the BAM.
    max_clipping -- forwarded to _get_totally_mapped_alignments to select
                    the alignments taken into account.
    max_distance -- when given, only distances strictly lower than it are
                    counted.

    Returns a dict with the keys 'outies', 'innies' and 'others'; each value
    is an IntCounter mapping distance -> number of alignment pairs.  Only
    pairs whose two alignments have the same rname are considered.
    '''
    bamfile = AlignmentFile(bam_fhand.name)
    stats = {}
    for orientation in ('outies', 'innies', 'others'):
        stats[orientation] = IntCounter()

    for qname_group in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(qname_group)
        # Every accepted alignment of the first mate is compared with every
        # accepted alignment of the second one.
        for first in _get_totally_mapped_alignments(mates[0], max_clipping):
            for second in _get_totally_mapped_alignments(mates[1],
                                                         max_clipping):
                if first.rname != second.rname:
                    continue
                candidate = [first, second]
                distance = _find_distance(candidate)
                if _mates_are_outies(candidate):
                    kind = 'outies'
                elif _mates_are_innies(candidate):
                    kind = 'innies'
                else:
                    kind = 'others'
                if max_distance is not None and not max_distance > distance:
                    continue
                stats[kind][distance] += 1
    return stats
def __call__(self, trim_packet):
    '''Trim the alignments of the instance's BAM file.

    The BAM named by self._bam_fhand is read, its alignments are grouped by
    query name and split into mates, and self._do_trim is applied to the
    alignments of every mate.  self._pre_trim(trim_packet) runs before and
    self._post_trim() after the trimming.

    Returns a dict with two keys: SEQS_PASSED holds a list with one
    single-item list per mate (the result of _do_trim), and ORPHAN_SEQS
    holds the orphan sequences of trim_packet, passed through untouched.
    '''
    self._pre_trim(trim_packet)
    bamfile = AlignmentFile(self._bam_fhand.name)
    trimmed_seqs = [
        [self._do_trim(mate_alignments)]
        for qname_group in _group_alignments_reads_by_qname(bamfile)
        for mate_alignments in _split_mates(qname_group)
    ]
    self._post_trim()
    return {SEQS_PASSED: trimmed_seqs,
            ORPHAN_SEQS: trim_packet[ORPHAN_SEQS]}
def pos_counter_by_pos(bam_fpath, positions):
    '''Yield one Counter of in-read base positions per requested column.

    The pileup of the BAM in bam_fpath is walked column by column.  For
    every column whose (reference name, reference position) tuple is found
    in positions, a Counter is yielded that maps "position of the piled-up
    base inside its read" to the number of reads showing that position.
    For reverse-strand reads the position is counted from the other end of
    the read (query_length - query_position - 1).

    positions -- container of (reference_name, reference_pos) tuples; a set
                 keeps the membership test cheap.

    Columns are yielded in the order pysam's pileup() produces them.  The
    BAM is closed once the generator is exhausted or closed.
    '''
    alignmentfile = AlignmentFile(bam_fpath)
    # reference id -> reference name.  The entry is stored on first use so
    # that getrname() runs once per reference, not once per pileup column.
    ref_name_index = {}
    try:
        for pileup_col in alignmentfile.pileup():
            ref_id = pileup_col.reference_id
            if ref_id not in ref_name_index:
                ref_name_index[ref_id] = alignmentfile.getrname(ref_id)

            position = (ref_name_index[ref_id], pileup_col.reference_pos)
            if position not in positions:
                continue

            pos_counter = Counter()
            # NOTE(review): this stdout output (and the one below) looks like
            # leftover debugging; kept because callers may rely on it.
            # Written so it prints the same under Python 2 and 3.
            print(position)
            for pileup_read in pileup_col.pileups:
                alignment = pileup_read.alignment
                base_pos = pileup_read.query_position
                # pysam documents query_position as None when the read has
                # a deletion or reference skip at this column.  Such reads
                # are counted under None on both strands instead of crashing
                # on the reverse-strand arithmetic.
                if base_pos is not None and alignment.is_reverse:
                    base_pos = alignment.query_length - base_pos - 1
                print('%s %s' % (base_pos, alignment.qname))
                pos_counter[base_pos] += 1
            yield pos_counter
    finally:
        alignmentfile.close()
def sort_by_position_in_ref(in_fhand, index_fpath, directory=None,
                            tempdir=None):
    '''Yield the sequences of in_fhand in the order of a sorted BAM.

    The input file is mapped with bowtie2 (--very-fast, plus -f when the
    detected format contains 'fasta') against the index in index_fpath, the
    mapping is written to a temporary sorted BAM and every alignment of
    that BAM is converted with alignedread_to_seqitem and yielded.

    in_fhand    -- file-like object; only its .name is used.
    index_fpath -- forwarded to map_with_bowtie2 as the index.
    directory   -- not used by this function.
    tempdir     -- forwarded to map_process_to_sortedbam.

    This is a generator: nothing is mapped until the first item is
    requested.

    NOTE(review): the original comment here read "changed to bwa mem from
    bowtie, test doesn't work well, check it out", but the code still maps
    with bowtie2.
    '''
    in_fpath = in_fhand.name
    # The file is reopened only for the format detection; close it as soon
    # as the format is known instead of leaking the handle.
    with open(in_fpath) as format_fhand:
        file_format = get_format(format_fhand)

    extra_params = ['--very-fast']
    if 'fasta' in file_format:
        extra_params.append('-f')
    bowtie2_process = map_with_bowtie2(index_fpath, paired_fpaths=None,
                                       unpaired_fpath=in_fpath,
                                       extra_params=extra_params)

    # out_fhand must stay referenced while the BAM is being read: the
    # temporary file disappears when the handle is closed or collected.
    out_fhand = NamedTemporaryFile()
    map_process_to_sortedbam(bowtie2_process, out_fhand.name,
                             tempdir=tempdir)

    samfile = AlignmentFile(out_fhand.name)
    try:
        for aligned_read in samfile:
            yield alignedread_to_seqitem(aligned_read)
    finally:
        samfile.close()
def _prepare_bams(self, bam_fpaths):
    '''Collect read groups and reference lengths from the given BAMs.

    Sets three attributes:
      self._bams     -- list with one {'fpath': path, 'rgs': read groups}
                        dict per BAM, in the order of bam_fpaths.
      self._rgs      -- dict: read group ID -> read group dict; every read
                        group dict gets a 'bam' key with the index of its
                        BAM in self._bams.
      self._ref_lens -- dict: reference name -> length, taken from the last
                        BAM (all BAMs are assumed to share the references).
                        Empty when bam_fpaths is empty.

    A BAM for which get_bam_readgroups returns None gets one placeholder
    read group with ID None and str(None) as its sample field.
    '''
    bams = []
    rgs = {}
    bam = None
    for idx, bam_fpath in enumerate(bam_fpaths):
        bam = AlignmentFile(bam_fpath)
        rgs_ = get_bam_readgroups(bam)
        if rgs_ is None:
            rgs_ = [{'ID': None,
                     self.bam_rg_field_for_vcf_sample: str(None)}]
        bams.append({'fpath': bam_fpath, 'rgs': rgs_})
        for read_group in rgs_:
            read_group['bam'] = idx
            # A read group ID repeated in a later BAM replaces the earlier
            # entry.
            rgs[read_group['ID']] = read_group
    self._bams = bams
    self._rgs = rgs

    # We have to assume that all bams have the same references.  With no
    # BAM at all there is nothing to read them from, so fall back to an
    # empty mapping instead of failing on an unbound variable.
    if bam is None:
        ref_lens = {}
    else:
        ref_lens = dict(zip(bam.references, bam.lengths))
    self._ref_lens = ref_lens
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''Map interleaved reads with bwa mem and count the mate distances.

    interleave_fhand -- file-like object; only its .name is passed to the
                        mapper as the interleaved read file.
    index_fpath      -- forwarded to map_with_bwamem as the index.
    max_clipping     -- forwarded to _get_totally_mapped_alignments to
                        select the alignments taken into account.
    max_distance     -- when given, only distances strictly lower than it
                        are counted.
    tempdir, threads -- forwarded to the sorting and mapping helpers.

    The mapping (bwa mem with -a -M) is written to a temporary BAM sorted
    with key='queryname'.  Returns a dict with the keys 'outies', 'innies'
    and 'others'; each value is an IntCounter mapping distance -> number of
    alignment pairs.  Only pairs whose two alignments have the same rname
    are considered.
    '''
    # The temporary BAM lives as long as this handle is referenced.
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa = map_with_bwamem(index_fpath,
                          interleave_fpath=interleave_fhand.name,
                          extra_params=['-a', '-M'], threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    bamfile = AlignmentFile(bam_fhand.name)

    outies, innies, others = IntCounter(), IntCounter(), IntCounter()
    stats = {'outies': outies, 'innies': innies, 'others': others}

    for qname_group in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(qname_group)
        for read1 in _get_totally_mapped_alignments(mates[0], max_clipping):
            for read2 in _get_totally_mapped_alignments(mates[1],
                                                        max_clipping):
                if read1.rname != read2.rname:
                    continue
                both = [read1, read2]
                distance = _find_distance(both)
                # The innies test only runs when the pair is not outies.
                kind = ('outies' if _mates_are_outies(both) else
                        'innies' if _mates_are_innies(both) else
                        'others')
                if max_distance is None or max_distance > distance:
                    stats[kind][distance] += 1
    return stats
def _get_mapped_reads(bam_fpath, min_mapq=0):
    '''Return the query names of the mapped reads of a BAM file.

    bam_fpath -- path of the BAM file.
    min_mapq  -- when truthy, only reads whose mapq is strictly greater
                 than it are kept; with the default 0 no quality filter is
                 applied.

    The names are returned as a list, in file order; a read name appears
    once per alignment kept.
    '''
    bam = AlignmentFile(bam_fpath)
    mapped_names = []
    for read in bam:
        if read.is_unmapped:
            continue
        if min_mapq and not read.mapq > min_mapq:
            continue
        mapped_names.append(read.qname)
    return mapped_names