Example 1
def refine(options):
    """
    Refine TE insertion and deletion calls within a group of related samples.
    Use indel calls from other samples in the group, inspect the corresponding
    regions of the genome in samples where the indel was not called, and look
    for evidence of the same indel at a much lower read-count threshold.
    """
    te = pybedtools.BedTool(options.te).sort()
    names = readNames(options.all_samples)
    if options.insertions is not False:
        insertions = getOtherLines(names, options.insertions)
    if options.deletions is not False:
        deletions = getOtherLines(names, options.deletions)  # format ([data], [inverse_accessions])
    print "Processing "+options.name
    chrom_sizes = check_bam(options.conc, options.proc, options.prefix)
    check_bam(options.split, options.proc, options.prefix, make_new_index=True)
    cov = calc_cov(options.conc, 100000, 120000)
    concordant = pysam.AlignmentFile(options.conc, 'rb')
    split_alignments = pysam.AlignmentFile(options.split, 'rb')
    name_indexed = pysam.IndexedReads(split_alignments)
    name_indexed.build()
    if options.deletions is not False:
        print("  checking deletions")
        process_missed(deletions, "deletion", concordant, split_alignments, name_indexed, options.name, te, cov/5, chrom_sizes)
    if options.insertions is not False:
        print("  checking insertions")
        process_missed(insertions, "insertion", concordant, split_alignments, name_indexed, options.name, te, cov/10, chrom_sizes)
Example 2
def get_st_alignments(contigs, st_bam):
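    # 'contigs' is assumed to be a pandas DataFrame with contig_id,
    # overlapping_genes and sample columns; 'np' is numpy (imports not shown in this excerpt)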
    bam = pysam.AlignmentFile(st_bam, 'rc')
    index = pysam.IndexedReads(bam)
    index.build()

    st_alignment = []
    for contig in contigs.contig_id.values:
        aligned_conts = [
            read.reference_name for read in index.find(contig)
            if read.reference_name
        ]
        aligned_conts = ','.join(
            np.unique(aligned_conts)) if len(aligned_conts) > 0 else ''
        st_alignment.append(aligned_conts)

    # get short gene name (first gene of every overlapping set of genes, include fusion genes)
    short_gnames = contigs.overlapping_genes.apply(get_short_gene_name)
    contig_ids, samples = contigs.contig_id, contigs['sample']
    con_names = [
        '|'.join([s, cid, sg])
        for cid, s, sg in zip(contig_ids, samples, short_gnames)
    ]

    contigs['expected_ST_alignment'] = con_names
    contigs['real_ST_alignment'] = st_alignment
    return contigs
Example 3
def extract_reads(options):
    n = get_names(options.names)

    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()

    # out = pysam.Samfile(options.out, 'wb', header=header)

    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for read in iterator:
                # out.write(x)
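                # revComp is assumed to be a str.maketrans complement table defined elsewhere in the module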
                if read.is_reverse:
                    if read.qual:
                        sys.stdout.write("@{0}\n{1}\n+\n{2}\n".format(read.qname, read.seq.translate(revComp)[::-1], read.qual[::-1]))
                    else:
                        sys.stdout.write(">{0}\n{1}\n".format(read.qname, read.seq.translate(revComp)[::-1]))
                else:
                    if read.qual:
                        sys.stdout.write("@{0}\n{1}\n+\n{2}\n".format(read.qname, read.seq, read.qual))
                    else:
                        sys.stdout.write(">{0}\n{1}\n".format(read.qname, read.seq))
Example 4
 def build(self, id_queries: List[Tuple[str, str, str]],
           refdict: ReferenceDict):
     log.info("Building BAM Index in memory for fetching VJ queries")
     bam_indexed = pysam.IndexedReads(self.bam)
     bam_indexed.build()
     log.info("Built index in memory for fast retrieval")
     count = 0
     report_interval = max(1, len(id_queries) // 25)
     for id_query in id_queries:
         barcode, umi, query_name = id_query
         read = Read(query_name)
         read.parse_alignments(bam_indexed, refdict)
         if read.top_V is None or read.top_J is None:
             log.error(
                 f"Should have gotten a top V and J for {query_name} but did not!"
             )
         if barcode not in self:
             self[barcode] = Barcode(barcode)
         if umi not in self[barcode]:
             self[barcode][umi] = UMI(umi)
         self[barcode][umi][query_name] = read
         count += 1
         if count % report_interval == 0:
             log.info(
                 f"Stored top alignments for {count} reads ({(count/len(id_queries)):.0%})."
             )
Example 5
def bam2py(bamfile):
    # this is probably not very efficient as it uses up a lot of memory but it'll do the job for now
    bamfile = pysam.AlignmentFile(bamfile, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    return header, name_indexed
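# Usage sketch for bam2py (not from the original source); the file names and
# the read name are assumptions. The returned header allows writing a new BAM
# with the same reference layout.
import pysam

header, name_indexed = bam2py("input.bam")
with pysam.AlignmentFile("subset.bam", "wb", header=header) as out:
    try:
        for aln in name_indexed.find("read_1234"):
            out.write(aln)
    except KeyError:
        pass  # read name not present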
Example 6
def rmb_dedup(basename):
    
    merge_bam_sort = "vector/" + basename + '_merge_vector.sort.bam'
    merge_bam_sort_dedup = "vector/" + basename + '_merge_vector_dedup.bam'
    merge_bam_sort_dedup_sort = "vector/" + basename + '_merge_vector_dedup.sort.bam'
    
    unique_barcode = "barcode/"+basename+"_barcode_uniq.txt"
    load(unique_barcode)
    # get uniq qname list
    data = pd.read_csv(unique_barcode, sep='\t', names=['Qname', 'Barcode', 'Freq', 'Length'], low_memory=False)
    uniq_qname_list = data['Qname'].tolist()
    
    #generate dedup bam
    merge_bam = pysam.AlignmentFile(merge_bam_sort, 'rb')
    dedup_bam = pysam.AlignmentFile(merge_bam_sort_dedup, "wb", template=merge_bam)
    
    #index bam name by pysam to generate dedup bam
    name_indexed = pysam.IndexedReads(merge_bam)
    name_indexed.build()
    for name in uniq_qname_list:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                dedup_bam.write(x)
    merge_bam.close()
    dedup_bam.close()
    pysam.sort("-o", merge_bam_sort_dedup_sort, merge_bam_sort_dedup)
Example 7
def annotate_contigs(args):
    '''
    Extract aligned contigs from supplied bam file and output
    annotated contig if it contains any novel bits
    '''
    ref_trees, ex_tree, ex_ref = get_gene_lookup(args.tx_ref_file)
    juncs = get_junc_lookup(args.junc_file)

    bam = pysam.AlignmentFile(args.bam_file, 'rc')
    bam_idx = pysam.IndexedReads(bam, multiple_iterators=True)
    bam_idx.build()
    outbam_file_unsort = '%s_unsorted.bam' % os.path.splitext(args.output_bam)[0]
    outbam = pysam.AlignmentFile(outbam_file_unsort, 'wb', template=bam)
    ci_file = args.contig_info_output

    logging.info('Checking contigs for non-reference content...')
    for read in bam.fetch(multiple_iterators=True):
        if read.reference_id < 0:
            logging.info('Skipping unmapped contig %s.' % read.query_name)
            continue

        if not do_any_read_blocks_overlap_exons(read, ex_tree, bam_idx):
            logging.info('Skipping contig %s as it doesn\'t overlap any reference exons.' % read.query_name)
            continue

        if read.query_name in record:
            # we have processed this read already (likely a read's partner)
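            # ('record' is assumed to be a module-level collection maintained elsewhere in the source file)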
            continue

        # only consider the contig if at least match_min bases align
        # to reference and at least match_perc_min of the read aligns
        rlen = read.reference_length
        qlen = float(read.query_length)
        if (rlen < MIN_MATCH_BP) or (rlen / qlen) < MIN_MATCH_PERC:
            logging.info('Skipping contig %s: not enough bases match reference' % read.query_name)
            continue

        allmatch = all([op == constants.CIGAR['match'] for op, val in read.cigar])
        if len(read.get_blocks()) == 1 and allmatch:
            chr_ex = get_chrom_ref_tree(read.reference_name, ex_tree)
            s, e = read.get_blocks()[0]
            if not (chr_ex.overlaps(s, s + 1) and chr_ex.overlaps(e - 1, e)):
                # skip the contig if the contig start or end are outside exons
                logging.info('Skipping contig %s: unspliced contiguous alignment' % read.query_name)
                continue

        is_hardclipped = any([op == constants.CIGAR['hard-clip'] and val >= MIN_CLIP for op, val in read.cigar])
        if is_hardclipped:
            annotate_fusion(args, read, juncs, bam_idx, ex_ref, ref_trees, outbam)
        else:
            annotate_single_read(args, read, juncs, ex_ref, ref_trees, outbam)

    bam.close()
    outbam.close()

    # sort and index the output bam
    pysam.sort('-o', args.output_bam, outbam_file_unsort)
    pysam.index(args.output_bam)
    os.remove(outbam_file_unsort)
Example 8
def index_bam(filepath):
    """Creates an in-memory index for BAM file.

    :param filepath: Path to the BAM file.
    """
    index = pysam.IndexedReads(pysam.AlignmentFile(filepath))
    index.build()
    return index
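# Usage sketch for index_bam (not from the original source); the path and the
# read name are assumptions. find() raises KeyError for unknown names.
idx = index_bam("sample.bam")
try:
    for aln in idx.find("read_1234"):
        print(aln.to_string())
except KeyError:
    print("read not found")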
Example 9
 def __init__(self, bam_file: Union[str, Path]):
     """
     Load alignment file and build index.
     """
     logging.debug(f"Loading {bam_file}.")
     self.alignments = pysam.AlignmentFile(bam_file)
     logging.debug(f"Building index for {bam_file}.")
     self.read_index = pysam.IndexedReads(self.alignments,
                                          multiple_iterators=True)
     self.read_index.build()
Example 10
def parse_cluster(path, bam_path, out_path, opt):
    # load the bam file
    logging.info("Loading the Bam file.")
    bamfile = pysam.AlignmentFile(bam_path, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()

    # load cluster info
    logging.info("Loading the cluster file.")
    num = 0
    file = open(path, 'r')
    for line in file:
        num += 1
        if num % 100 == 0:
            logging.info("Finished %d clusters."%(num))
        seq = line.strip('\n').split('\t')
        chrom = seq[0]  # renamed from 'chr', which shadows the builtin
        breakpoint = seq[1]+'_'+seq[2]+'_'+seq[3]+'_'+str(len(seq[4:]))
        id_list = seq[4:]

        if len(id_list) < 5:
            continue

        if opt == "fq":
            file_path = "%s%s_%s.fq" % (out_path, chrom, breakpoint)
        else:
            file_path = "%s%s_%s.fa" % (out_path, chrom, breakpoint)
        out_file = open(file_path, 'w')

        for name in id_list:
            try:
                name_indexed.find(name)
            except KeyError:
                pass
            else:
                iterator = name_indexed.find(name)
                for read in iterator:
                    if read.is_reverse:
                        if opt == 'fq':
                            out_file.write("@{0}\n{1}\n+\n{2}\n".format(read.qname, read.seq.translate(revComp)[::-1], read.qual[::-1]))
                        else:
                            out_file.write(">{0}\n{1}\n".format(read.qname, read.seq.translate(revComp)[::-1]))
                            # out_file.write("@{0}\n{1}\n+\n{2}\n".format(read.qname, read.seq.translate(revComp)[::-1], read.qual[::-1]))
                    else:
                        if opt == 'fq':
                            out_file.write("@{0}\n{1}\n+\n{2}\n".format(read.qname, read.seq, read.qual))
                        else:
                            out_file.write(">{0}\n{1}\n".format(read.qname, read.seq))
        out_file.close()

    file.close()
Example 11
 def extract_raw_reads(self, read_names, inbamfile):
     raw_reads = []
     read_names_indexed = pysam.IndexedReads(inbamfile)
     read_names_indexed.build()
     for name in read_names:
         try:
             read_names_indexed.find(name)
         except KeyError:
             pass
         else:
             iterator = read_names_indexed.find(name)
             for x in iterator:
                 raw_reads.append(x)
     return raw_reads
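# Sketch (not from the original source): building the index is the expensive
# step, and this method rebuilds it on every call. Caching one index per
# AlignmentFile avoids that; the names below are hypothetical.
_index_cache = {}

def get_name_index(inbamfile):
    key = id(inbamfile)
    if key not in _index_cache:
        index = pysam.IndexedReads(inbamfile)
        index.build()
        _index_cache[key] = index
    return _index_cache[key]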
Example 12
def get_pos(file_path, name):
    bamfile = sam.AlignmentFile(file_path, "rb")
    idx = sam.IndexedReads(bamfile)
    idx.build()
    name = idx.find(name)

    #finds first
    for read in name:
        pos = read.reference_start
        break

    bamfile.close()

    return pos
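# Variant sketch (not from the original source): get_pos above crashes on
# unknown names (find() raises KeyError) and leaves pos unbound when there
# are no alignments; this returns None in both cases. Assumes 'sam' is the
# pysam alias used above.
def get_pos_safe(file_path, name):
    bamfile = sam.AlignmentFile(file_path, "rb")
    idx = sam.IndexedReads(bamfile)
    idx.build()
    try:
        read = next(idx.find(name))        # first alignment for this name
        return read.reference_start
    except (KeyError, StopIteration):      # unknown name, or no alignments
        return None
    finally:
        bamfile.close()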
Example 13
def main(inbam1: str = typer.Option(..., help="BAM from ref1"),
         inbam2: str = typer.Option(..., help="BAM from ref2"),
         ctg_hap1: str = typer.Option(None, help="link this contig in ref1"),
         ctgs_hap1: str = typer.Option(
             None, help="link these contigs in ref1 (list file)"),
         outdir: str = typer.Option(...,
                                    help="output link file from ref1 to ref2"),
         savematrix: str = typer.Option(None,
                                        help="save link matrix to this file")):
    """根据reads比对情况,输出ref1到ref2的对应关系。
    只支持BAM文件。
    """
    t1 = time.time()
    print('load reference info')
    chrom2offset_hap1, chrom2length_hap1, rnames_hap1, offsets_hap1 = load_refinfo(
        inbam1)
    chrom2offset_hap2, chrom2length_hap2, rnames_hap2, offsets_hap2 = load_refinfo(
        inbam2)

    print('build reads index')
    alnfile_hap2 = pysam.AlignmentFile(inbam2, 'rb')
    name_indexed_hap2 = pysam.IndexedReads(alnfile_hap2)
    name_indexed_hap2.build()

    print('start link:')
    if ctgs_hap1:
        ctgs_hap1 = [x.strip() for x in open(ctgs_hap1)]
    else:
        ctgs_hap1 = [
            ctg_hap1,
        ]
    for ctg_hap1 in ctgs_hap1:
        print(f'ctg: {ctg_hap1}')
        matrix_R2T = load_pos_to_matrix(inbam1, name_indexed_hap2, ctg_hap1,
                                        chrom2length_hap1, chrom2length_hap2,
                                        chrom2offset_hap2)
        if savematrix:
            save_npz(os.path.join(savematrix, f'{ctg_hap1}.npz'), matrix_R2T)


#        matrix_to_pos(ctg_hap1, matrix_R2T, os.path.join(outdir, f'{ctg_hap1}.linkpos.tsv.gz'), rnames_hap2, offsets_hap2, chrom2offset_hap2)

    t2 = time.time()
    runtime = t2 - t1
    h = runtime // 3600
    m = (runtime - h * 3600) // 60
    s = runtime - h * 3600 - m * 60
    print(f'Finished! Executed in {h:.0f}h {m:.0f}m {s:.0f}s ({runtime}s)')
Example 14
def extract_reads(options):
    n = get_names(options.names)
    bamfile = pysam.AlignmentFile(options.bam, 'rb', check_sq=False)
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    out = pysam.Samfile(options.out, 'wb', header=header)
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                out.write(x)
Example 15
def unmerge_bams(out_file_prefix, valid_cells, n_cores):
    input_file = out_file_prefix + ".merged.aligned.sorted.bam"

    if not os.path.isdir(out_file_prefix + "_deindexed_bam"):
        os.mkdir(out_file_prefix + "_deindexed_bam")

    with ps.AlignmentFile(input_file, "rb") as f_in:
        indexed = ps.IndexedReads(f_in)
        indexed.build()
        header = f_in.header.copy()

        for cell_id in valid_cells:
            out_file_name = out_file_prefix + "_deindexed_bam/" + cell_id + ".bam"
            with ps.AlignmentFile(out_file_name, "wb", header=header) as f_out:
                iterator = indexed.find(cell_id)
                for i in iterator:
                    f_out.write(i)
Example 16
def write_anomalous_read_to_bam(bam,split_reads,span_reads,anom_reads,out):
    print('Writing anom reads to file')
    split_reads = np.unique(split_reads['query_name'])
    span_reads = np.unique(span_reads['query_name'])
    anom_reads = np.unique(anom_reads['query_name'])

    # need to filter out any reads that were at any point marked as valid supporting reads
    anom_reads = np.array([x for x in anom_reads if x not in split_reads])
    anom_reads = np.array([x for x in anom_reads if x not in span_reads])

    bamf = pysam.AlignmentFile(bam, "rb")
    index = pysam.IndexedReads(bamf)
    index.build()
    anom_bam = pysam.AlignmentFile("%s_anom_reads.bam" % out, "wb", template=bamf)
    for read_name in anom_reads:
        for read in index.find(read_name):
            anom_bam.write(read)
    anom_bam.close()
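# Sketch (not from the original source): the two list comprehensions above
# can be written as vectorised numpy set operations with the same result.
mask = ~np.isin(anom_reads, split_reads) & ~np.isin(anom_reads, span_reads)
anom_reads = anom_reads[mask]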
Example 17
def primer_filter(basename, vector_fa, genome, bait_chr, bait_strand,
                  sgRNA_start, sgRNA_end):

    directory_store = vector_fa.rpartition('.')[0]
    print("[PEM-Q Vector Analysis]  processing primer filter...")
    pe_bam_sort = basename + '_pe_vector.sort.bam'
    pe_primer_bam = basename + '_primer_vector.bam'
    pe_primer_bam_sort = basename + '_primer_vector.sort.bam'

    primer_list_file = pd.read_csv("primer/bamlist_stitch.txt",
                                   sep=' ',
                                   names=["Qname", "Bait_start", "Bait_end"])
    primer_list = primer_list_file["Qname"]
    vector_pe_bam = pysam.AlignmentFile(directory_store + "/" + pe_bam_sort,
                                        'rb')
    vector_primer_bam = pysam.AlignmentFile(directory_store + "/" +
                                            pe_primer_bam,
                                            "wb",
                                            template=vector_pe_bam)

    vector_pe_bam_indexed = pysam.IndexedReads(vector_pe_bam)
    vector_pe_bam_indexed.build()
    n = 0
    for name in primer_list:
        try:
            vector_pe_bam_indexed.find(name)
        except KeyError:
            pass
        else:

            iterator = vector_pe_bam_indexed.find(name)
            for x in iterator:
                n = n + 1
                vector_primer_bam.write(x)
    print("primer filter left:", n)
    vector_pe_bam.close()
    vector_primer_bam.close()
    pysam.sort("-o", directory_store + "/" + pe_primer_bam_sort,
               directory_store + "/" + pe_primer_bam)
    cmd = "samtools index {}/{}".format(directory_store, pe_primer_bam_sort)
    os.system(cmd)
Example 18
def filter_reads(alignment_file, readdb, read_dirs, quality_threshold=7, recursive=False, trim=False):
    """Filter fast5 files based on a quality threshold and if there is an alignment
    :param alignment_file: BAM alignment file
    :param readdb: readdb or sequence summary file
    :param read_dirs: list of directories
    :param quality_threshold: phred quality score min threshold for passing
    :param recursive: search directories recursively for more fast5 dirs
    :param trim: number of bases to analyze
    """
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    if trim:
        assert isinstance(trim, int), "Trim needs to be an integer: {}".format(trim)
    else:
        trim = np.inf
    n_bases = 0
    n_files = 0
    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        name_indexed.build()
        for name, fast5 in parse_read_name_map_file(readdb, read_dirs, recursive=recursive):
            try:
                if trim < n_bases:
                    print("Filtered {} files for {} bases".format(n_files, n_bases))
                    break

                iterator = name_indexed.find(name)
                for aligned_segment in iterator:
                    if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                            or aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        continue
                    # get data and sanity check
                    if aligned_segment.query_qualities is not None:
                        if np.mean(aligned_segment.query_qualities) < quality_threshold:
                            continue
                    n_files += 1
                    n_bases += aligned_segment.query_length
                    yield fast5, aligned_segment

            except KeyError:
                print("Found no alignments for {}".format(fast5))
Example 19
 def extract_reads_by_name_list(self, sf_names, sf_bam, sf_out_bam):
     l_names = self.load_read_names(sf_names)
     bamfile = pysam.AlignmentFile(sf_bam,
                                   'rb',
                                   reference_filename=self.sf_reference)
     name_indexed = pysam.IndexedReads(
         bamfile)  # hashes the read names in memory for fast lookup
     name_indexed.build()
     header = bamfile.header.copy()
     out = pysam.Samfile(sf_out_bam, 'wb', header=header)
     for name in l_names:
         try:
             name_indexed.find(name)
         except KeyError:
             pass
         else:
             iterator = name_indexed.find(name)
             for x in iterator:  # x is an alignment
                 # TODO: check whether this is the first or the second read of the pair
                 out.write(x)
     out.close()
Example 20
def get_cigar(file_path, name):
    bamfile = sam.AlignmentFile(file_path, "rb")
    idx = sam.IndexedReads(bamfile)
    idx.build()
    name = idx.find(name)

    cigar_align = []

    for read in name:
        # tmp = read.get_blocks()
        if (not (read.is_unmapped)):  # if it's mapped
            cigarLine = read.cigar

            for (cigarType, cigarLength) in cigarLine:
                try:
                    if (cigarType == 0):  # match
                        for i in range(cigarLength):
                            cigar_align.append('.')
                    elif (cigarType == 1):  # insertions
                        for i in range(cigarLength):
                            cigar_align.append('i')
                    elif (cigarType == 2):  # deletion
                        for i in range(cigarLength):
                            cigar_align.append('d')
                    elif (cigarType == 3):  # skip
                        for i in range(cigarLength):
                            cigar_align.append('s')
                    elif (cigarType == 4):  # soft clipping
                        continue
                    elif (cigarType == 5):  # hard clipping
                        continue
                    elif (cigarType == 6):  # padding
                        for i in range(cigarLength):
                            cigar_align.append('p')
                    else:
                        print("Wrong CIGAR number")
                        sys.exit(1)
                except Exception as e:
                    print("Problem parsing CIGAR: {}".format(e))
        # note: returns after the first read found for this name
        return cigar_align
Example 21
def extract_reads(options):
    n = get_names(options.names)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    out = open(options.out, 'w')
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                start = x.reference_start
                end = x.reference_end
                mq = x.mapping_quality
                #chrom=x.target_name
                chrom = x.reference_name
                rl = x.query_length
                out.write("%s %s %s %s %s %s\n" %
                          (name, chrom, start, end, mq, rl))
Example 22
def extract_reads_cigar(options):
    reads_file = '%s/read_names.txt' % (options.outdir)
    n = get_names(reads_file)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    # out = pysam.Samfile(options.out, 'wb', header=header)
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            count = 0  # was 'n', which shadowed the list of read names
            for x in iterator:
                count += 1
            multi_align_list = []
            iterator = name_indexed.find(name)
            for x in iterator:
                #if count == 2 or x.cigar == [(0, 150)]:
                # print (name, x.reference_name, x, x.cigar,cigar2array(x.cigar))
                #   continue
                if x.is_unmapped:
                    continue
                cigararray = cigar2array(x.cigar)
                # print (x.cigar)
                s = 0
                # locus_set = x.get_reference_positions(full_length=True)
                t_name = x.reference_name

                # pre_ref, pre_read = '', ''
                for i in range(len(cigararray)):
                    base_quality = x.query_qualities[i]
                    p = 10**(-base_quality / 10)
                    cigar = cigararray[i]
                    print(p, cigar)
Example 23
def multiprocess_get_summary_info(alignment_file, readdb, read_dirs, get_summary_args, worker_count=1, debug=False):
    """Multiprocess get summary info"""
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    data = pd.DataFrame([])
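    # NOTE: pd.DataFrame.append (used below) was removed in pandas 2.0;
    # on current pandas, collect the rows and pd.concat them instead.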

    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        print("Indexing bam file by read name.")
        name_indexed.build()
        print("Finished.")
        print("Looping through readdb file.")
        if debug:
            for name, fast5, iterator in parse_readdb_wrapper(parse_readdb(readdb, read_dirs), name_indexed):
                pd_line = create_summary_pd(iterator, fast5, name, **get_summary_args)
                data = data.append(pd_line)
        else:
            total, failure, messages, output = multithread.run_service2(
                get_summary_info_service, parse_readdb_wrapper(parse_readdb(readdb, read_dirs), name_indexed),
                get_summary_args, ['name', "fast5", "sam_lines"], worker_count)
            for pd_line in output:
                if isinstance(pd_line, pd.DataFrame):
                    data = data.append(pd_line)
    return data
Example 24
def readAndFilterBam(config):
    print("Read bam", config.bam_file)

    bam_file_all = pysam.AlignmentFile(filepath_or_object=config.bam_file,
                                       mode='rb')
    index = pysam.IndexedReads(bam_file_all)
    index.build()

    reads_names = set()
    for ch in config.contigs:
        for read in bam_file_all.fetch(
                ch,
                0,
        ):
            reads_names.add(read.query_name)

    result = []
    for read_name in reads_names:
        reads = [
            r for r in index.find(read_name)
            if validRead(r, config) and not r.is_secondary
        ]
        readsInChromosoms = [
            r for r in reads if r.reference_name in config.contigs
        ]
        if len(reads) < 2 or not readsInChromosoms:
            continue
        segments = [Segment(read) for read in reads]
        segments.sort(key=lambda x: x.query_alignment_start)
        read_alignment = calculateAlignment(segments)
        # print("alignment", read_alignment * 100, "%")
        if read_alignment < config.min_read_aligment:
            continue
        result.append(segments)

    return result, bam_file_all
Example 25
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Split a BAM file containing alignments from 
                                     a SLAM/TUC/TL-seq experiment into labelled/unlabelled BAM files.
                                     Requires the MD tag. All reads are kept incl. unmapped, 
                                     secondary/supplementary, etc. these can be filtered out when 
                                     counting (featureCounts). Optionally, SNPs can be subtracted."""
    )

    parser.add_argument('bam', help="""The input BAM file (full path).""")

    parser.add_argument('outdir_bam',
                        help="""The output directory (BAM files).""")

    parser.add_argument(
        'outdir_mm', help="""The output directory (mismatch information).""")

    parser.add_argument('name',
                        help="""The output base name without extension.""")

    parser.add_argument('-s',
                        '--subtract',
                        help="""SNPs to be subtracted (GS default format)""",
                        type=str)

    parser.add_argument('--vcf',
                        help="""Use this flag if SNPs are in VCF format""",
                        action='store_true')

    parser.add_argument('-ref',
                        '--ref-base',
                        help="""Conversion reference base.""",
                        choices=['A', 'C', 'G', 'T'],
                        default='T')

    parser.add_argument('-bc',
                        '--base-change',
                        help="""Conversion base (substitution/mismatch).""",
                        choices=['A', 'C', 'G', 'T'],
                        default='C')

    parser.add_argument(
        '-q',
        '--base-qual',
        help="The minimum base quality for any given mismatch (default: 20).",
        type=int,
        default=20)

    parser.add_argument(
        '--trim5p',
        help="The number bases to trim at the 5' ends of reads (default: 0).",
        type=int,
        default=0)

    parser.add_argument(
        '--trim3p',
        help="The number bases to trim at the 3' ends of reads (default: 0).",
        type=int,
        default=0)

    parser.add_argument('--overwrite',
                        help='''If this flag is present, then existing files
        will be overwritten.''',
                        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="""Optional argument: where to write 
        temporary files. If not specified, programs-specific tmp will be used.""",
                        default=None)

    utils.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "[splbam]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # if using slurm, submit the script
    if args.use_slurm:
        cmd = "{}".format(' '.join(shlex.quote(s) for s in sys.argv))
        utils.check_sbatch(cmd, args=args)
        return

    # check output path
    exist = utils.check_files_exist([args.outdir_bam, args.outdir_mm],
                                    raise_on_error=True,
                                    logger=logger)

    # check that all files exist
    input_files = [args.bam]
    if args.subtract:
        input_files.append(args.subtract)
    exist = utils.check_files_exist(input_files,
                                    raise_on_error=True,
                                    logger=logger)

    # create the output files - BAM
    labelled_filename = '{}.labelled.unsrt.bam'.format(args.name)
    labelled_filename = os.path.join(args.outdir_bam, labelled_filename)

    sorted_labelled_filename = '{}.labelled.bam'.format(args.name)
    sorted_labelled_filename = os.path.join(args.outdir_bam,
                                            sorted_labelled_filename)

    unlabelled_filename = '{}.unlabelled.unsrt.bam'.format(args.name)
    unlabelled_filename = os.path.join(args.outdir_bam, unlabelled_filename)

    sorted_unlabelled_filename = '{}.unlabelled.bam'.format(args.name)
    sorted_unlabelled_filename = os.path.join(args.outdir_bam,
                                              sorted_unlabelled_filename)

    # create the output files - mismatches
    mismatch_details_filename = '{}.mismatchDetails.tab.gz'.format(args.name)
    mismatch_details_filename = os.path.join(args.outdir_mm,
                                             mismatch_details_filename)

    mismatch_filename = '{}.mismatches.tab.gz'.format(args.name)
    mismatch_filename = os.path.join(args.outdir_mm, mismatch_filename)

    mismatch_filename_final = '{}.mismatches-used.tab.gz'.format(args.name)
    mismatch_filename_final = os.path.join(args.outdir_mm,
                                           mismatch_filename_final)

    # and check if exist
    out_files = [
        labelled_filename, sorted_labelled_filename, unlabelled_filename,
        sorted_unlabelled_filename, mismatch_details_filename,
        mismatch_filename, mismatch_filename_final
    ]
    all_out_exists = all([os.path.exists(of) for of in out_files])
    if not args.overwrite and all_out_exists:
        msg = "All output files {} already exist. Skipping call.".format(
            out_files)
        logger.warning(msg)
        return

    msg = "Getting all mismatches"
    logger.info(msg)

    # first get all mismatches
    bam = ps.AlignmentFile(args.bam, "rb")
    SN = (SQ['SN'] for SQ in bam.header['SQ'])
    bam.close()

    mismatch_and_details = utils.apply_parallel_iter(SN,
                                                     args.num_cpus,
                                                     get_mismatches,
                                                     args.bam,
                                                     progress_bar=False,
                                                     backend='multiprocessing')

    all_details = [a for a, b in mismatch_and_details]
    mismatch_count = get_mismatch_details(all_details,
                                          mismatch_details_filename,
                                          mismatch_filename)

    all_mismatches = [b for a, b in mismatch_and_details]
    all_mismatches = pd.concat(all_mismatches)

    # get the conversion of interest
    bc = get_base_vec(args.base_change, '+')
    m_bc = all_mismatches['bases11'] == bc
    m_ref = all_mismatches['ref'] == args.ref_base
    all_mismatches = all_mismatches[m_ref & m_bc]

    # filter base quality - no offset of 33 needs to be subtracted
    m_qual = all_mismatches['base_qual'] >= args.base_qual

    # below we keep track of discarded mismatches to adjust rates
    discarded = all_mismatches[~m_qual].copy()
    m = (discarded['read1'] == True) & (discarded['score'] == True)
    discarded_first = discarded[m].shape[0]
    m = (discarded['read1'] == False) & (discarded['score'] == True)
    discarded_second = discarded[m].shape[0]

    all_mismatches = all_mismatches[m_qual]

    # discard mismatches found at read ends
    m_trim5p = all_mismatches['m_pos'] < (args.trim5p -
                                          all_mismatches['qstart'])
    m_trim3p = all_mismatches['m_pos'] >= (all_mismatches['rlen'] -
                                           args.trim3p)
    discarded = all_mismatches[m_trim5p | m_trim3p].copy()
    m = (discarded['read1'] == True) & (discarded['score'] == True)
    discarded_first += discarded[m].shape[0]
    m = (discarded['read1'] == False) & (discarded['score'] == True)
    discarded_second += discarded[m].shape[0]

    all_mismatches = all_mismatches[~m_trim5p & ~m_trim3p]

    # remove all SNPs from what remains
    if args.subtract:
        # currently GRAND-SLAM snpdata default format
        if args.vcf:
            snps = utils.fmt_convert(args.subtract)
        else:
            snps = pd.read_csv(args.subtract, sep='\t')
        snps = snps.Location.unique()
        # add field
        all_mismatches['Location'] = all_mismatches[['contig', 'start']].apply(
            lambda x: ':'.join([str(s) for s in x]), axis=1)

        discarded = all_mismatches[all_mismatches.Location.isin(snps)].copy()
        m = (discarded['read1'] == True) & (discarded['score'] == True)
        discarded_first += discarded[m].shape[0]
        m = (discarded['read1'] == False) & (discarded['score'] == True)
        discarded_second += discarded[m].shape[0]

        all_mismatches = all_mismatches[~all_mismatches.Location.isin(snps)]

    # adjust final mismatch counts
    m = (mismatch_count.Orientation == 'First') & (
        mismatch_count.Genomic == 'A') & (mismatch_count.Read == 'G')
    mismatch_count.loc[
        m,
        'Mismatches'] = mismatch_count.loc[m, 'Mismatches'] - discarded_first
    n = (mismatch_count.Orientation == 'Second') & (
        mismatch_count.Genomic == 'T') & (mismatch_count.Read == 'C')
    mismatch_count.loc[
        n,
        'Mismatches'] = mismatch_count.loc[n, 'Mismatches'] - discarded_second
    mismatch_count = mismatch_count[m | n]
    mismatch_count.to_csv(mismatch_filename_final,
                          sep='\t',
                          index=False,
                          compression='gzip')

    # what remains are true conversions, other reads are classified as unlabelled
    # NOTE: we keep all query_name for which at least one read has a mismatch
    # this include read pairs, but also multi-mapping reads
    true_conversions = all_mismatches.name.unique()

    # now split the BAM file
    msg = "Reading the alignments and splitting the input BAM file"
    logger.info(msg)

    # requires a lot of memory however...
    bam = ps.AlignmentFile(args.bam, "rb")
    qname_index = ps.IndexedReads(bam)
    qname_index.build()

    # we first "split" the query names, sort by query name and write each file in turn
    # we don't sort the lists, this will not be faster, the index is just fine
    true_conversions = set(true_conversions)
    all_qnames = set([a.query_name for a in bam.fetch(until_eof=True)])
    all_qnames = all_qnames - true_conversions

    labelled = ps.AlignmentFile(labelled_filename, "wb", template=bam)
    unlabelled = ps.AlignmentFile(unlabelled_filename, "wb", template=bam)

    # labelled/new
    for qname in true_conversions:
        alignments = qname_index.find(qname)
        for a in alignments:
            labelled.write(a)
    labelled.close()

    # unlabelled/old
    for qname in all_qnames:
        alignments = qname_index.find(qname)
        for a in alignments:
            unlabelled.write(a)
    unlabelled.close()

    bam.close()

    # create the bam index if it does not already exist
    args.num_cpus = 6  # limit... otherwise this is problematic?!
    args.keep_intermediate_files = False  # delete unsorted bam file

    utils.sort_bam_file(labelled_filename, sorted_labelled_filename, args)
    utils.index_bam_file(sorted_labelled_filename, args)

    utils.sort_bam_file(unlabelled_filename, sorted_unlabelled_filename, args)
    utils.index_bam_file(sorted_unlabelled_filename, args)
Example 26
def get_alignment_summary_info_withdb(alignment_file, readdb, read_dirs, pass_threshold=7,
                                      gap_size=10, verbose=False, max_reads=100, number=0):
    """Filter fast5 files based on a quality threhsold and if there is an alignment"""
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    seen_counter = 0

    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        print("Indexing bam file by read name.")
        name_indexed.build()
        print("Finished.")
        print("Looping through readdb file.")
        for name, fast5 in parse_readdb(readdb, read_dirs):
            try:
                iterator = name_indexed.find(name)
                # create ability to only grab x number of reads
                if seen_counter >= max_reads:
                    break
                # need to start the data table with first row
                if seen_counter == 0:
                    pd_data = get_summary_info_row(name)
                    big_table = pd_data
                else:
                    big_table = big_table.append(pd_data)  # append returns a new frame; reassign or the row is lost
                    pd_data = get_summary_info_row(name)

                # start tracking data
                pd_data["seen"] = 1
                seen_counter += 1

                cl_handle = CreateLabels(fast5, kmer_index=2)
                seq_start_time = cl_handle.raw_attributes['start_time']
                pd_data["seq_start_time"] = seq_start_time

                for aligned_segment in iterator:
                    if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                            or aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        if aligned_segment.is_secondary:
                            pd_data["num_secondary_mappings"] += 1
                        if aligned_segment.is_unmapped:
                            pd_data["no_mapping"] = 1
                        if aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                            pd_data["chimera_mapping"] += 1
                    else:
                        pd_data["map_q"] = aligned_segment.mapq
                        soft_clipped_percentage = \
                            1 - float(len(aligned_segment.query_alignment_sequence)) / len(aligned_segment.query_sequence)
                        pd_data["soft_clipped_percentage"] = soft_clipped_percentage

                        handle = AlignmentSegmentWrapper(aligned_segment)
                        handle.initialize()

                        accuracy = handle.alignment_accuracy()
                        pd_data["basecalled_accuracy"] = accuracy
                        try:
                            mea = cl_handle.add_mea_labels(number=int(number))
                            sa_full = cl_handle.add_signal_align_predictions(number=int(number), add_basecall=True)
                            all_basecall_data = []
                            for name, basecall_data in cl_handle.aligned_signal.prediction.items():
                                if "guide" in name:
                                    all_basecall_data.extend(basecall_data)

                            alignment_summary = analyze_event_skips(mea, sa_full, all_basecall_data, generate_plot=False)
                            flagged_gaps_summary = flag_large_gaps(alignment_summary, gap_size, verbose=verbose)
                            counter = 0
                            total_distance = 0
                            for gap in flagged_gaps_summary:
                                if gap["mea_peak_distance"] > 10:
                                    counter += 1
                                    total_distance += gap["mea_peak_distance"]
                            if counter > 0:
                                pd_data["num_flagged_gaps"] = counter
                                pd_data["avg_flagged_gap_size"] = float(total_distance) / counter

                            q_score_average = 0
                            if aligned_segment.query_qualities is None:
                                print("Alignment done with fasta instead of fastq so read qualities will not be reported")
                            else:
                                q_score_average = np.mean(aligned_segment.query_qualities)

                            pd_data["q_score_average"] = q_score_average
                            print("pd_data['q_score_average']", pd_data["q_score_average"][0])
                            if pd_data["q_score_average"][0] > pass_threshold:
                                pd_data["pass"] = 1

                        except Exception as e:
                            pd_data["other_errors"] = 1
                            print("ERROR {}: {}".format(fast5, e), file=sys.stderr)

            except KeyError:
                pd_data["other_errors"] = 1
                print("Found no alignments for {}".format(fast5))

        return big_table
Example 27
import pysam
from collections import defaultdict

# load and index the CellRanger BAM file
cr_bam = pysam.AlignmentFile(snakemake.input[0], mode="rb")
cr_idx = pysam.IndexedReads(cr_bam)
cr_idx.build()

# load and iterate through the PathSeq BAM file
pathseq_bam = pysam.AlignmentFile(snakemake.input[1], mode="rb")

output = []
# d = defaultdict(lambda: defaultdict(list))
# seg is an AlignedSegment object
for seg in pathseq_bam.fetch(until_eof=True):
    # returns an IteratorRowSelection object, which contains one or more AlignedSegment object
    cr_list = list(cr_idx.find(seg.query_name))
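    # find() raises KeyError for unknown names; this snippet assumes every
    # PathSeq query name also appears in the CellRanger BAM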
    # we assume that all records belonging to the same query name will have the same CB/UB tag
    # not all records will have the CB tag and the UB tag
    if cr_list[0].has_tag("CB") and cr_list[0].has_tag("UB"):
        CB = cr_list[0].get_tag(tag="CB")
        UB = cr_list[0].get_tag(tag="UB")
        # using set_tags removes all other tags - use set_tag instead
        seg.set_tag("CB", CB, "Z")
        seg.set_tag("UB", UB, "Z")
        # d[CB][UB].append(seg)
    # keep all PathSeq alignments
    output.append(seg)

# write all PathSeq alignments with or without tags
all_pathseq_bam = pysam.AlignmentFile(snakemake.output[0], mode="wb",
                                      template=pathseq_bam)
# (the snippet was truncated here; the mode/template arguments and the write
# loop below are an assumed completion)
for seg in output:
    all_pathseq_bam.write(seg)
all_pathseq_bam.close()
Example 28
def extract_reads(options):
    print('start assigning reads...')
    reads_file = '%s/read_names.txt' % (options.outdir)
    assign_file = '%s/assign_file.txt' % (options.outdir)
    out = open(assign_file, 'w')
    n = get_names(reads_file)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    error, total, remove = 0, 0, 0
    error_set = []
    for name in n:
        try:
            iterator = name_indexed.find(name)
            dict = {}
            pair_dict = {}
            len_dict = {}
            for x in iterator:
                if x.is_unmapped:
                    continue
                s = 0
                # locus_set = x.get_reference_positions(full_length=True)
                t_name = x.reference_name
                # if name == 'A00132:58:HFL2TDSXX:4:2276:6515:29183':
                #     print ('#', t_name, x.next_reference_name)
                if t_name != x.next_reference_name:
                    continue
                # print (t_name, x.next_reference_name)
                # if not re.search('DRB1', t_name):
                #     continue
                match_num = 0
                soft_num = 0
                all_num = 0
                for ci in x.cigar:
                    if ci[0] == 0:
                        match_num += ci[1]
                    elif ci[0] == 4:
                        soft_num += ci[1]
                    all_num += ci[1]
                if soft_num > 0:
                    continue

                mis_NM = 0
                for ta in x.get_tags():
                    if ta[0] == 'NM':
                        match_num -= ta[1]
                        mis_NM += ta[1]
                if mis_NM > options.max_nm:
                    continue
                    # print (ta, match_num)
                # match_rate = match_num/(all_num - soft_num)
                focus_len = all_num - soft_num

                # if name == 'A00217:72:HFKHYDSXX:4:2603:32542:29637':
                #     print (x.cigar, match_rate, all_num, soft_num)
                # refSequence = x.get_reference_sequence()
                # pre_ref, pre_read = '', ''
                # j = 0
                # for i in range(len(locus_set)):
                #     read_allele = x.query_sequence[i].upper()
                #     if str(locus_set[i]) != 'None':
                #         ref_allele = refSequence[j].upper()
                #         j += 1
                #     else:
                #         ref_allele = 'NONE'
                #     base_quality = x.query_qualities[i]
                #     p = 10 ** (- base_quality / 10)

                #     alpha = calculate_alpha(read_allele, ref_allele, pre_ref, pre_read, p, options)
                #     beta = calculate_beta(read_allele, options)
                #     # print (p, alpha, beta)
                #     s += alpha
                #     s += beta
                #     pre_ref, pre_read = ref_allele, read_allele
                if t_name not in dict.keys():
                    dict[t_name] = match_num  #round(s,3)
                    len_dict[t_name] = focus_len
                    pair_dict[t_name] = 1
                else:
                    dict[t_name] += match_num  #round(s,3)
                    len_dict[t_name] += focus_len
                    pair_dict[t_name] += 1
            #evaluation
            total += 1
            if len(dict) == 0:
                continue

            # first_align = l[0][0].split('*')[0]
            # reads_len = #list(len_dict.values())[0]
            for key in dict.keys():
                # print (len_dict[key])
                if len_dict[key] < 0:  # make sure the read is paired and mapped
                    dict[key] = 0
                else:
                    dict[key] = float(dict[key]) / len_dict[key]
            first_align = check_score(dict, options, name, pair_dict)
            # print (dict, first_align, reads_len)
            # break
            if first_align == 'REMOVE':
                remove += 1
                continue
            print(name, first_align, file=out)
        except KeyError:
            pass
    out.close()
Example 29
    def filter_multiple_adapter(self):

        chek_file = self.adapter_bam_check_sort

        if not os.path.exists(chek_file):

            if not self.fastq_check:
                raise ValueError(
                    'merged fastq is needed for adapter check alignment.')
            if not os.path.exists("adapter/adapter.fa"):
                raise ValueError(
                    'adapter/adapter.fa is needed for adapter check alignment.'
                )

            #alignment
            print("[PEM-Q]  align to check adapter...")

            cmd = "bwa mem -t 8 adapter/adapter -k 5 -L 0 -T 14 {} > {} 2>barcode/bwa_align_adapter.log".format(
                self.fastq_check, self.adapter_sam_check)
            os.system(cmd)
            print("[PEM-Q] " + cmd)
            cmd = "samtools view -S -b -h {} > {} \
                   && samtools sort {} > {} \
                   && samtools index {}".format(self.adapter_sam_check,
                                                self.adapter_bam_check,
                                                self.adapter_bam_check,
                                                self.adapter_bam_check_sort,
                                                self.adapter_bam_check_sort)
            print("[PEM-Q]  sort and index bam...")
            os.system(cmd)
        else:
            print("[PEM-Q]  adapter check alignment file exist, jump...")

        #keep record of multiple adapters
        multiple_adapt = open("barcode/" + self.basename +
                              "_multiple_adapt.txt",
                              "w")  # file recording reads with multiple adapters
        clean_adapt = open("barcode/" + self.basename + "_clean_adapt.txt",
                           "w")  # file recording reads without multiple adapters
        bam_file = pysam.AlignmentFile(self.adapter_bam_check_sort,
                                       "rb")  # read in the sorted bam file
        multiple_adapt_list = []
        clean_adapt_list = []
        for read in bam_file:
            condition1 = any(
                'SA' == tg[0]
                for tg in read.get_tags())  # does this read have supplementary alignments?
            if condition1:
                multiple_adapt_list.append(read.query_name)
                multiple_adapt.write(read.query_name + "\n")
            else:
                clean_adapt_list.append(read.query_name)
                clean_adapt.write(read.query_name + "\n")
        multiple_adapt.close()
        clean_adapt.close()
        bam_file.close()
        #remove reads with multiple adapters
        primer_bam = pysam.AlignmentFile(self.primer_bam, 'rb')
        dedup_bam_sort = primer_bam
        filter_bam = pysam.AlignmentFile(self.filter_bam,
                                         "wb",
                                         template=dedup_bam_sort)
        name_indexed = pysam.IndexedReads(dedup_bam_sort)
        name_indexed.build()
        for name in clean_adapt_list:
            try:
                name_indexed.find(name)
            except KeyError:
                pass
            else:
                iterator = name_indexed.find(name)
                for x in iterator:
                    filter_bam.write(x)
        dedup_bam_sort.close()
        filter_bam.close()
        pysam.sort("-o", self.filter_bam_sort, self.filter_bam)
        primer_bam.close()
        return ()
Example 30
            continue
        if targetPos in refPairedPositions:
            readTargetPos = queryPairedPositions[refPairedPositions.index(
                targetPos)]
            if readTargetPos is not None:
                readTargetBase = entry.query_sequence[readTargetPos]
                if readTargetBase.upper() == targetBase:
                    selectedReadNames.add(entry.query_name)

sys.stderr.write("\nFound {} entries carrying a target base\n".format(
    len(selectedReadNames)))

#index file for pulling out reads
sys.stderr.write(
    "\nBuilding read name index for extracting selected pairs...\n")
readIndex = pysam.IndexedReads(inBam)
readIndex.build()

sys.stderr.write("\nWriting {} selected entries...\n".format(
    len(selectedReadNames)))
for readName in selectedReadNames:
    entries = readIndex.find(readName)
    for entry in entries:
        outBam.write(entry)

inBam.close()
outBam.close()
targetsFile.close()

sys.stderr.write("\nSorting output bam file...\n")