def run_bam2sam(x):
    """Convert a BAM file to SAM with `samtools view`; remove the BAM on success.

    The SAM is written next to the input, named from the input's basename
    plus '.sam'. On failure (empty/missing output) the BAM is kept and a
    failure message is logged.
    """
    src_dir, src_base = dir_name(x), base_name(x)
    sam_path = join_path(src_dir, src_base + '.sam')
    _ = popen_communicate('samtools view -h {} -o {}'.format(x, sam_path))
    # Only delete the source BAM once the conversion demonstrably produced output.
    if not is_nonempty_file(sam_path):
        print_logger('Failed: bam to sam {} to {}'.format(x, sam_path))
        return
    print_logger('Finished: bam to sam {} to {}'.format(x, sam_path))
    rmfile(x)
def demultiplex_sam(samfile, outdir, bc_length):
    """Split an alignment file into one SAM file per cell barcode.

    The barcode is parsed from each alignment's query name via `_cell_seq`;
    an output SAM named '<barcode>.sam' is created lazily in `outdir` the
    first time a barcode is seen.

    Parameters
    ----------
    samfile : str or None
        Path to the input file (opened by pysam with mode 'rb').
        A falsy value makes this a no-op.
    outdir : str
        Directory receiving the per-barcode SAM files.
    bc_length : int
        Barcode length forwarded to `_cell_seq`.
    """
    if not samfile:
        return
    samobj = pysam.AlignmentFile(samfile, 'rb')
    dict_samout = {}
    try:
        for aln in samobj:
            bc = _cell_seq(aln.query_name, length=bc_length)
            fh = dict_samout.get(bc)
            if fh is None:
                outsam = join_path(outdir, bc + '.sam')
                fh = pysam.AlignmentFile(outsam, 'w', template=samobj)
                dict_samout[bc] = fh
            fh.write(aln)
    finally:
        # BUGFIX: the input AlignmentFile was never closed (fd leak), and
        # output handles leaked if iteration raised; close everything here.
        for fh in dict_samout.values():
            fh.close()
        samobj.close()
def main():
    """Shrink a project directory: gzip stray FASTQs, convert SAMs to BAMs.

    With --dryrun, only report how many files would be processed.
    Logs before/after storage sizes of the FASTQ and alignment subdirs.
    """
    parser = get_argument_parser()
    args = parser.parse_args()

    SUBDIR_FASTQ = 'small_fq'
    SUBDIR_ALIGN = 'small_sam'

    # Collect candidate files under the project directory.
    fqs = glob.glob(join_path(args.project_dir, SUBDIR_FASTQ, '*', '*.fastq'),
                    recursive=True)
    fqs_unknown = glob.glob(join_path(args.project_dir, SUBDIR_FASTQ, '*',
                                      'UNKNOWN', '*.fq'),
                            recursive=True)
    sams = glob.glob(join_path(args.project_dir, SUBDIR_ALIGN, '*', '*.sam'),
                     recursive=True)

    if args.dryrun:
        print_logger(('{} fastqs are '
                      'to be gzipped. ').format(len(fqs + fqs_unknown)))
        print_logger('{} sams are to be converted to bam'.format(len(sams)))
        return 0

    subdir_fastq_size0 = dirsize_str(join_path(args.project_dir, SUBDIR_FASTQ))
    subdir_align_size0 = dirsize_str(join_path(args.project_dir, SUBDIR_ALIGN))

    # Fan the work out over a process pool; each task is independent.
    pool = Pool(args.cores)
    for fq in fqs + fqs_unknown:
        pool.apply_async(run_gzip_fastq, args=(fq,))
    for sam in sams:
        pool.apply_async(run_sam2bam, args=(sam,))
    pool.close()
    pool.join()

    subdir_fastq_size1 = dirsize_str(join_path(args.project_dir, SUBDIR_FASTQ))
    subdir_align_size1 = dirsize_str(join_path(args.project_dir, SUBDIR_ALIGN))
    print_logger('Storage of FASTQs: {} => {}'.format(subdir_fastq_size0,
                                                      subdir_fastq_size1))
    print_logger('Storage of Alignments: {} => {}'.format(subdir_align_size0,
                                                          subdir_align_size1))
    return 0
def demultiplex_sam_with_claim(samfile, outdir, bc_length, claimed_bc):
    """Split an alignment file into per-barcode SAMs, for claimed barcodes only.

    Unlike `demultiplex_sam`, the output files are pre-created for every
    barcode in `claimed_bc`; alignments whose barcode is not claimed are
    silently dropped. Pre-created files stay present (possibly empty) even
    if no read matches them.

    Parameters
    ----------
    samfile : str or None
        Path to the input file (opened by pysam with mode 'rb').
        A falsy value makes this a no-op.
    outdir : str
        Directory receiving the per-barcode SAM files.
    bc_length : int
        Barcode length forwarded to `_cell_seq`.
    claimed_bc : iterable of str or None
        Barcodes to keep. A falsy value makes this a no-op.
    """
    if not samfile:
        return
    if not claimed_bc:
        return
    samobj = pysam.AlignmentFile(samfile, 'rb')
    dict_samout = {}
    try:
        for bc in claimed_bc:
            dict_samout[bc] = pysam.AlignmentFile(
                join_path(outdir, bc + '.sam'), 'w', template=samobj)
        for aln in samobj:
            bc = _cell_seq(aln.query_name, length=bc_length)
            fh = dict_samout.get(bc)
            if fh is None:
                # Unclaimed barcode: skip the read.
                continue
            fh.write(aln)
    finally:
        # BUGFIX: the input AlignmentFile was never closed (fd leak), and
        # output handles leaked if iteration raised; close everything here.
        for fh in dict_samout.values():
            fh.close()
        samobj.close()
def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq, outdir,
                   start_umi=0, start_bc=6, len_umi=6, len_bc=6, len_tx=35,
                   bc_qual_min=10, is_gzip=True, save_unknown_bc_fastq=False,
                   tagging_only=False, tag_to='tagged.fastq',
                   do_bc_rev_complement=False, do_tx_rev_complement=False,
                   verbose=False):
    """Demultiplex paired FASTQ reads into per-barcode FASTQ files.

    Read 1 carries the UMI and cell barcode; read 2 carries the transcript.
    For each qualified pair, the transcript read is written to the file
    matching its cell barcode, renamed '@BC-<bc>_UMI-<umi>'.

    Parameters
    ----------
    read1_fpath, read2_fpath : str
        Paths to the paired FASTQ files (gzipped iff `is_gzip`).
    dict_bc_id2seq : dict
        Maps barcode id -> barcode sequence; one output file per entry,
        named 'BC-<id>-<seq>.fastq' in `outdir`.
    outdir : str
        Output directory; an 'UNKNOWN' subfolder is always created.
    start_umi, len_umi : int
        0-based offset and length of the UMI within read 1.
    start_bc, len_bc : int
        0-based offset and length of the cell barcode within read 1.
    len_tx : int
        Transcript reads longer than this are truncated to it.
    bc_qual_min : int
        Minimum Phred quality (offset 33) required at every UMI/BC position.
    save_unknown_bc_fastq : bool
        If True, pairs with unrecognized barcodes are saved under UNKNOWN/.
    tagging_only : bool
        If True, all known-barcode output goes to a single file `tag_to`.
    do_bc_rev_complement, do_tx_rev_complement : bool
        Accepted but currently unused — kept for interface compatibility.
    verbose : bool
        Log progress every 1,000,000 read pairs.

    Returns
    -------
    Counter
        Keys: 'total', 'qualified', 'unqualified', 'unknown', 'saved',
        plus one count per cell barcode.
    """
    if is_gzip:
        fh_umibc = filehandle_fastq_gz(read1_fpath)
        fh_tx = filehandle_fastq_gz(read2_fpath)
    else:
        fh_umibc = open(read1_fpath, 'rt')
        fh_tx = open(read2_fpath, 'rt')

    sample_counter = Counter()

    # Map barcode sequence -> output path first, then replace values with
    # open file handles below.
    bc_fhout = dict()
    for bc_id, bc_seq in dict_bc_id2seq.items():
        bc_fhout[bc_seq] = join_path(outdir,
                                     'BC-{}-{}.fastq'.format(bc_id, bc_seq))
    mkfolder(join_path(outdir, 'UNKNOWN'))
    bc_fhout['UNKNOWNBC_R1'] = join_path(outdir, 'UNKNOWN', 'UNKNOWNBC_R1.fq')
    bc_fhout['UNKNOWNBC_R2'] = join_path(outdir, 'UNKNOWN', 'UNKNOWNBC_R2.fq')

    if tagging_only:
        out_fpath_tagged_fq = join_path(outdir, tag_to)
        out_fh_tagged_fq = open(out_fpath_tagged_fq, 'w')
    for bc_seq, v in bc_fhout.items():
        if bc_seq.startswith('UNKNOWN'):
            bc_fhout[bc_seq] = open(v, 'w')
            continue
        if tagging_only:
            # All known barcodes share the single tagged-output handle.
            bc_fhout[bc_seq] = out_fh_tagged_fq
        else:
            bc_fhout[bc_seq] = open(v, 'w')

    # PERF: the set of read-1 positions to quality-check depends only on the
    # start/len parameters, so compute it once instead of per read pair.
    umibc_idx = sorted(set(range(start_umi, start_umi + len_umi)) |
                       set(range(start_bc, start_bc + len_bc)))

    i = 0
    while True:
        if verbose and i % 1000000 == 0:
            print_logger('Processing {:,} reads...'.format(i))
        try:
            # FASTQ records are 4 lines; the '+' separator lines are skipped.
            umibc_name = next(fh_umibc).rstrip()
            umibc_seq = next(fh_umibc).rstrip()
            next(fh_umibc)
            umibc_qualstr = next(fh_umibc).rstrip()
            tx_name = next(fh_tx).rstrip()
            tx_seq = next(fh_tx).rstrip()
            next(fh_tx)
            tx_qualstr = next(fh_tx).rstrip()
            i += 1
        except StopIteration:
            break
        # NOTE(review): no validation of record integrity, 4-line format, or
        # read pairing is performed — the caller must supply well-formed,
        # properly paired FASTQ files.
        sample_counter['total'] += 1

        # Read 1 too short to contain both UMI and barcode: drop the pair.
        if len(umibc_seq) < len(umibc_idx):
            continue
        umibc_min_qual = min(ord(umibc_qualstr[j]) - 33 for j in umibc_idx)
        if umibc_min_qual < bc_qual_min:
            continue
        sample_counter['qualified'] += 1

        umi = umibc_seq[start_umi:(start_umi + len_umi)]
        cell_bc = umibc_seq[start_bc:(start_bc + len_bc)]
        try:
            fhout = bc_fhout[cell_bc]
        except KeyError:
            if save_unknown_bc_fastq:
                fhout = bc_fhout['UNKNOWNBC_R1']
                fhout.write('{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
                                                      "+", umibc_qualstr))
                fhout = bc_fhout['UNKNOWNBC_R2']
                fhout.write('{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
                                                      "+", tx_qualstr))
            sample_counter['unknown'] += 1
            continue

        # Truncate (never pad) the transcript read to len_tx.
        if len(tx_seq) > len_tx:
            tx_seq, tx_qualstr = tx_seq[:len_tx], tx_qualstr[:len_tx]
        read_name = '@BC-{}_UMI-{}'.format(cell_bc, umi)
        fhout.write('{}\n{}\n{}\n{}\n'.format(read_name, tx_seq,
                                              "+", tx_qualstr))
        sample_counter[cell_bc] += 1
        sample_counter['saved'] += 1

    sample_counter['unqualified'] = sample_counter['total'] - \
        sample_counter['qualified']

    # Closing an already-closed handle is a no-op, so the shared
    # tagging_only handle may safely be closed multiple times here.
    for _, v in bc_fhout.items():
        v.close()
    fh_umibc.close()
    fh_tx.close()
    return (sample_counter)
def _remove_gz_suffix(x):
    """Return path `x` with a trailing '.gz' stripped from its basename.

    BUGFIX: the previous `str.replace('.gz', '')` removed every '.gz'
    occurrence anywhere in the name (e.g. 'a.gz.fq.gz' -> 'a.fq'); only a
    trailing '.gz' is removed now, matching the function's name.
    """
    xbasename = base_name(x)
    xdirname = dir_name(x)
    if xbasename.endswith('.gz'):
        xbasename = xbasename[:-len('.gz')]
    return join_path(xdirname, xbasename)