def test_get_read_group_info():
    """Check that get_read_group_info maps each read group ID to its fields.

    A small SAM fixture declaring two read groups (g1 and g3) is written to
    a temporary file, handed to sam2bam together with a temporary .bam path,
    and the result is opened with pysam. The mapping returned for that file
    must hold the LB, SM and PL values of both read groups.
    """
    # One SAM record per line: two @SQ reference entries, two @RG read
    # groups and three alignments tagged with RG:Z:g1 or RG:Z:g3.
    sam_records = [
        '@SQ\tSN:SGN-U576692\tLN:1714',
        '@SQ\tSN:SGN-U572743\tLN:833',
        '@RG\tID:g1\tLB:g1\tSM:g1\tPL:sanger',
        '@RG\tID:g3\tLB:g3\tSM:g3\tPL:sanger',
        ('SGN-E200000\t0\tSGN-U572743\t317\t226\t14M\t*\t0\t0\t'
         'GGATGATKTTAGAG\t*\t'
         'AS:i:250\tXS:i:0\tXF:i:0\tXE:i:7\tXN:i:0\tRG:Z:g1'),
        ('SGN-E40000\t0\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\t'
         'AGCCTGATAA\t,,09377777\t'
         'AS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3'),
        ('SGN-E40000\t20\tSGN-U576692\t1416\t207\t10M\t*\t0\t0\t'
         'AGCCTGATAA\t,,09377777\t'
         'AS:i:160\tXS:i:0\tXF:i:3\tXE:i:4\tXN:i:0\tRG:Z:g3'),
    ]
    sam_sample = '\n'.join(sam_records) + '\n'

    # The context managers guarantee that both temporary files are closed
    # (and therefore removed) even when the conversion or the check fails.
    with NamedTemporaryFile(suffix='.sam') as sam_fhand:
        sam_fhand.write(sam_sample)
        # sam2bam is given the file *name*, so the data has to be on disk
        # before it is called.
        sam_fhand.flush()

        with NamedTemporaryFile(suffix='.bam') as bam_fhand:
            sam2bam(sam_fhand.name, bam_fhand.name)

            bam = pysam.Samfile(bam_fhand.name, 'rb')
            try:
                read_gro_i = get_read_group_info(bam)
            finally:
                # release the BAM handle before the temporary file goes away
                bam.close()

    assert read_gro_i == {'g3': {'LB': 'g3', 'SM': 'g3', 'PL': 'sanger'},
                          'g1': {'LB': 'g1', 'SM': 'g1', 'PL': 'sanger'}}
def _snvs_in_bam(bam, reference, min_quality, default_sanger_quality,
                 min_mapq, min_num_alleles, max_maf, min_num_reads_for_allele,
                 read_edge_conf=None, default_bam_platform=None):
    """Yield the snv information for every snv found in the given reference.

    The pileup columns of the reference are visited one by one. For every
    column the alleles supported by the reads are collected, filtered
    (N alleles, low quality alleles, maf check, minimum number of reads per
    allele) and, if what is left qualifies as a variation, a dict describing
    the position is yielded.

    Parameters:
        bam -- open alignment file; pileup() and getrname() are called on it
            and it is passed to get_read_group_info.
        reference -- reference sequence; its name is obtained with
            get_seq_name and its residues are read from reference.seq.
        min_quality -- threshold passed to _remove_bad_quality_alleles.
        default_sanger_quality -- value passed to
            _add_default_sanger_quality.
        min_mapq -- reads whose mapping quality is below this value are
            ignored.
        min_num_alleles -- minimum number of alleles required to report a
            position (coerced to int). With a value of 1 a position holding
            a single allele is reported only if that allele is not
            INVARIANT.
        max_maf -- threshold passed to check_maf_ok; columns that fail the
            check are skipped.
        min_num_reads_for_allele -- threshold passed to
            _remove_alleles_by_read_number.
        read_edge_conf -- optional mapping from platform to an
            (edge_left, edge_right) pair; either item may be None. Reads
            whose position in the column falls inside those edge regions are
            not used.
        default_bam_platform -- platform used when the bam header holds no
            read group information, or when a read belongs to a read group
            that is not in it.

    Yields:
        dicts with the keys 'ref_name', 'ref_position', 'reference_allele',
        'alleles' and 'read_groups'.

    Raises:
        ValueError -- if the bam has no read group information and no
            default_bam_platform was given. As this is a generator the error
            surfaces when it is first advanced, not when it is created.
    """
    global SEGMENTS_CACHE

    min_num_alleles = int(min_num_alleles)

    read_groups_info = get_read_group_info(bam)
    if not read_groups_info:
        if default_bam_platform is None:
            msg = ('Platform is not present either in header or in '
                   'configuration')
            raise ValueError(msg)
        read_groups_info = {UNKNOWN_RG: {'PL': default_bam_platform}}

    reference_id = get_seq_name(reference)
    reference_seq = reference.seq
    reference_len = len(reference_seq)

    # we can clean the cache of segments because we're in a new molecule
    SEGMENTS_CACHE = {}

    for column in bam.pileup(reference=reference_id):
        ref_pos = column.pos
        # columns that lie beyond the end of the reference sequence have no
        # reference allele to compare with
        if ref_pos >= reference_len:
            continue

        alleles = {}
        ref_id = bam.getrname(column.tid)
        ref_allele = reference_seq[ref_pos].upper()

        for pileup_read in column.pileups:
            # for each read in the column we add its allele to the alleles
            # dict
            aligned_read = pileup_read.alignment

            # we ignore the reads that are likely to be misaligned
            read_mapping_qual = aligned_read.mapq
            if read_mapping_qual < min_mapq:
                continue

            try:
                read_group = aligned_read.opt('RG')
            except KeyError:
                # the read carries no RG tag
                read_group = UNKNOWN_RG

            read_name = aligned_read.qname

            # read_groups_info is never empty at this point, so only the
            # membership of the read group has to be checked
            if read_group in read_groups_info:
                platform = read_groups_info[read_group]['PL']
            else:
                platform = default_bam_platform

            read_pos = pileup_read.qpos

            alleles_here, read_limits = _get_alleles_from_read(ref_allele,
                                                               ref_pos,
                                                               pileup_read)

            if read_edge_conf and platform in read_edge_conf:
                edge_left, edge_right = read_edge_conf[platform]

                # if we're in the edge region to be ignored we continue to
                # the next read, because there's no allele to add for this
                # one.
                # NOTE(review): read_limits is presumably the (first, last)
                # usable position of the read -- confirm against
                # _get_alleles_from_read.
                if (edge_left is not None and
                        read_limits[0] + edge_left > read_pos):
                    continue
                if (edge_right is not None and
                        read_pos > read_limits[1] - edge_right):
                    continue

            for allele, kind, qual, is_reverse in alleles_here:
                _add_allele(alleles, allele, kind, read_name, read_group,
                            is_reverse, qual, read_mapping_qual,
                            read_groups_info)

        # remove N
        _remove_alleles_n(alleles)

        # add default sanger qualities to the sanger reads with no quality
        _add_default_sanger_quality(alleles, default_sanger_quality,
                                    read_groups_info)

        # remove bad quality alleles
        _remove_bad_quality_alleles(alleles, min_quality)

        # check maf
        if not check_maf_ok(alleles, max_maf):
            continue

        # min_num_reads_for_allele
        _remove_alleles_by_read_number(alleles, min_num_reads_for_allele)

        if not alleles:
            continue

        # A variation is yielded when
        #   - there are more alleles than the minimum requested, OR
        #   - one allele is requested and the one found is not invariant, OR
        #   - several alleles are requested and at least that many are found
        num_alleles = len(alleles)
        if num_alleles > min_num_alleles:
            is_variation = True
        elif min_num_alleles == 1:
            # Exactly one allele is left here. The second item of its key is
            # compared with INVARIANT; next(iter()) reads that key without
            # building a list and also works with dict views.
            is_variation = next(iter(alleles))[1] != INVARIANT
        else:
            is_variation = (min_num_alleles > 1 and
                            num_alleles >= min_num_alleles)

        if is_variation:
            yield {'ref_name': ref_id,
                   'ref_position': ref_pos,
                   'reference_allele': ref_allele,
                   'alleles': alleles,
                   'read_groups': read_groups_info}