예제 #1
0
파일: slim.py 프로젝트: yanailab/celseq2
def run_bam2sam(x):
    """
    Convert the alignment file ``x`` to SAM via ``samtools view -h``.

    The output path is built from the directory and base name of ``x``
    (as returned by the ``dir_name`` / ``base_name`` helpers) plus '.sam'.
    When the output exists and is non-empty the input ``x`` is removed;
    otherwise a failure is logged and ``x`` is left in place.
    """
    sam_path = join_path(dir_name(x), base_name(x) + '.sam')
    popen_communicate('samtools view -h {} -o {}'.format(x, sam_path))

    # Guard clause: keep the input untouched if the conversion yielded nothing.
    if not is_nonempty_file(sam_path):
        print_logger('Failed: bam to sam {} to {}'.format(x, sam_path))
        return

    print_logger('Finished: bam to sam {} to {}'.format(x, sam_path))
    rmfile(x)
예제 #2
0
def demultiplex_sam(samfile, outdir, bc_length):
    """
    Split the alignments of ``samfile`` into one SAM file per barcode.

    The barcode of each alignment is derived from its query name by
    ``_cell_seq(query_name, length=bc_length)``. An output file
    ``<outdir>/<barcode>.sam`` is created lazily the first time a barcode
    is seen, using the input file as header template.

    Args:
        samfile: Path of the alignment file to split. A falsy value makes
            the function return immediately.
        outdir: Directory receiving the per-barcode SAM files.
        bc_length: Barcode length forwarded to ``_cell_seq``.

    Returns:
        None.
    """
    if not samfile:
        return

    # NOTE(review): 'rb' is pysam's BAM read mode although the parameter is
    # named ``samfile`` -- confirm the expected input format with callers.
    samobj = pysam.AlignmentFile(samfile, 'rb')
    dict_samout = {}

    try:
        for aln in samobj:
            bc = _cell_seq(aln.query_name, length=bc_length)
            fh = dict_samout.get(bc)
            if fh is None:
                outsam = join_path(outdir, bc + '.sam')
                fh = pysam.AlignmentFile(outsam, 'w', template=samobj)
                dict_samout[bc] = fh

            fh.write(aln)
    finally:
        # Always release every handle, including the input, even when
        # reading or writing fails part-way through.
        for fh in dict_samout.values():
            fh.close()
        samobj.close()
예제 #3
0
파일: slim.py 프로젝트: yanailab/celseq2
def main():
    """
    Compress the FASTQ and SAM files found under a project directory.

    Command-line arguments come from ``get_argument_parser()``; this
    function reads ``project_dir``, ``dryrun`` and ``cores`` from them.
    FASTQ files are looked up in ``<project_dir>/small_fq`` and SAM files
    in ``<project_dir>/small_sam``. With ``dryrun`` set, only the number of
    files that would be processed is logged. Otherwise every FASTQ is
    handed to ``run_gzip_fastq`` and every SAM to ``run_sam2bam`` on a
    worker pool, and the directory sizes before and after are logged.

    Returns:
        0 in all cases.
    """
    parser = get_argument_parser()
    args = parser.parse_args()

    SUBDIR_FASTQ = 'small_fq'
    SUBDIR_ALIGN = 'small_sam'
    # NOTE: ``recursive=True`` only matters for patterns containing '**';
    # these patterns match exactly one (or two) directory levels.
    fqs = glob.glob(join_path(args.project_dir, SUBDIR_FASTQ, '*', '*.fastq'),
                    recursive=True)
    fqs_unknown = glob.glob(join_path(args.project_dir, SUBDIR_FASTQ, '*',
                                      'UNKNOWN', '*.fq'), recursive=True)
    sams = glob.glob(join_path(args.project_dir, SUBDIR_ALIGN, '*', '*.sam'),
                     recursive=True)

    if args.dryrun:
        print_logger(('{} fastqs are '
                      'to be gzipped. ').format(len(fqs + fqs_unknown)))
        print_logger('{} sams are to be converted to bam'.format(len(sams)))
        return 0

    dir_fastq = join_path(args.project_dir, SUBDIR_FASTQ)
    dir_align = join_path(args.project_dir, SUBDIR_ALIGN)
    subdir_fastq_size0 = dirsize_str(dir_fastq)
    subdir_align_size0 = dirsize_str(dir_align)

    def _failure_logger(task, fpath):
        """Build a callback that logs an exception raised by a worker."""
        def _on_error(exc):
            print_logger('Failed: {} {}: {!r}'.format(task, fpath, exc))
        return _on_error

    # Without an error callback (or a call to AsyncResult.get) exceptions
    # raised inside the workers would be dropped silently.
    # NOTE(review): assumes ``Pool`` is ``multiprocessing.Pool`` -- confirm.
    pool = Pool(args.cores)
    try:
        for fq in fqs + fqs_unknown:
            pool.apply_async(run_gzip_fastq, args=(fq,),
                             error_callback=_failure_logger('gzip', fq))
        for sam in sams:
            pool.apply_async(run_sam2bam, args=(sam,),
                             error_callback=_failure_logger('sam2bam', sam))
    finally:
        # Stop accepting work and wait for everything already submitted.
        pool.close()
        pool.join()

    subdir_fastq_size1 = dirsize_str(dir_fastq)
    subdir_align_size1 = dirsize_str(dir_align)

    print_logger('Storage of FASTQs: {} => {}'.format(subdir_fastq_size0,
                                                      subdir_fastq_size1))
    print_logger('Storage of Alignments: {} => {}'.format(subdir_align_size0,
                                                          subdir_align_size1))

    return 0
예제 #4
0
def demultiplex_sam_with_claim(samfile, outdir, bc_length, claimed_bc):
    """
    Split ``samfile`` into per-barcode SAM files for claimed barcodes only.

    One output file ``<outdir>/<barcode>.sam`` is created up front for
    every barcode in ``claimed_bc`` (so a claimed barcode without any
    alignment still gets a header-only file). Alignments whose barcode, as
    computed by ``_cell_seq(query_name, length=bc_length)``, is not claimed
    are dropped.

    Args:
        samfile: Path of the alignment file to split. A falsy value makes
            the function return immediately.
        outdir: Directory receiving the per-barcode SAM files.
        bc_length: Barcode length forwarded to ``_cell_seq``.
        claimed_bc: Iterable of barcode strings to keep. A falsy value
            makes the function return immediately.

    Returns:
        None.
    """
    if not samfile:
        return
    if not claimed_bc:
        return

    # NOTE(review): 'rb' is pysam's BAM read mode although the parameter is
    # named ``samfile`` -- confirm the expected input format with callers.
    samobj = pysam.AlignmentFile(samfile, 'rb')
    dict_samout = {}

    try:
        for bc in claimed_bc:
            if bc in dict_samout:
                # A repeated barcode would reopen (and truncate) the same
                # path while orphaning the first handle.
                continue
            dict_samout[bc] = pysam.AlignmentFile(
                join_path(outdir, bc + '.sam'), 'w', template=samobj)

        for aln in samobj:
            bc = _cell_seq(aln.query_name, length=bc_length)
            fh = dict_samout.get(bc)
            if fh is None:
                continue
            fh.write(aln)
    finally:
        # Always release every handle, including the input, even when
        # opening, reading or writing fails part-way through.
        for fh in dict_samout.values():
            fh.close()
        samobj.close()
예제 #5
0
def demultiplexing(read1_fpath,
                   read2_fpath,
                   dict_bc_id2seq,
                   outdir,
                   start_umi=0,
                   start_bc=6,
                   len_umi=6,
                   len_bc=6,
                   len_tx=35,
                   bc_qual_min=10,
                   is_gzip=True,
                   save_unknown_bc_fastq=False,
                   tagging_only=False,
                   tag_to='tagged.fastq',
                   do_bc_rev_complement=False,
                   do_tx_rev_complement=False,
                   verbose=False):
    """
    Demultiplexing to fastq files based on barcode sequence.

    Read 1 supplies the UMI and the cell barcode; read 2 supplies the
    sequence that is written out. Both files are consumed four lines at a
    time in lockstep. Records are not validated (well-formedness, pairing),
    and reading stops silently as soon as either file runs out of lines.

    For every read pair:

    * it is skipped as unqualified when read 1 (sequence or quality string)
      is too short to cover all UMI/barcode positions, or when the lowest
      quality (``ord(char) - 33``) over those positions is below
      ``bc_qual_min``;
    * if its barcode is not a value of ``dict_bc_id2seq`` it counts as
      unknown and, when ``save_unknown_bc_fastq`` is set, both reads are
      appended unchanged to ``UNKNOWN/UNKNOWNBC_R1.fq`` and
      ``UNKNOWN/UNKNOWNBC_R2.fq`` under ``outdir``;
    * otherwise read 2, truncated to at most ``len_tx`` bases, is written
      under the name ``@BC-<barcode>_UMI-<umi>``.

    Args:
        read1_fpath: Path of the FASTQ holding UMI + barcode reads.
        read2_fpath: Path of the FASTQ holding the paired reads.
        dict_bc_id2seq: Mapping of barcode id to barcode sequence.
        outdir: Output directory. ``BC-<id>-<seq>.fastq`` files and the
            ``UNKNOWN`` sub-folder are created in it.
        start_umi: 0-based start of the UMI within read 1.
        start_bc: 0-based start of the barcode within read 1.
        len_umi: Length of the UMI.
        len_bc: Length of the barcode.
        len_tx: Maximum number of read 2 bases kept; shorter reads are
            written as they are.
        bc_qual_min: Minimum quality required at every UMI/barcode position.
        is_gzip: If true the inputs are opened with ``filehandle_fastq_gz``,
            otherwise with ``open(..., 'rt')``.
        save_unknown_bc_fastq: Whether to save pairs with unknown barcode.
            The two UNKNOWN files are created either way.
        tagging_only: If true, reads of all known barcodes go to the single
            file ``<outdir>/<tag_to>`` instead of one file per barcode.
        tag_to: File name used when ``tagging_only`` is set.
        do_bc_rev_complement: Accepted but not used by this function.
        do_tx_rev_complement: Accepted but not used by this function.
        verbose: Log progress every 1,000,000 read pairs.

    Returns:
        A ``Counter`` with the keys 'total', 'qualified', 'unqualified',
        'unknown', 'saved', plus one entry per barcode sequence that had
        at least one read saved.
    """
    # Read 1 positions covered by the UMI and/or the barcode. This does not
    # depend on the read, so it is computed once rather than per record.
    umibc_idx = sorted(
        set(range(start_umi, start_umi + len_umi))
        | set(range(start_bc, start_bc + len_bc)))
    # Read 1 must reach the highest position, not merely have as many
    # characters as there are positions (the two differ whenever the
    # UMI/barcode layout does not start at 0 or has a gap).
    umibc_len_min = umibc_idx[-1] + 1 if umibc_idx else 0

    opened = []  # every handle opened below; all are closed in `finally`
    try:
        if is_gzip:
            fh_umibc = filehandle_fastq_gz(read1_fpath)
            opened.append(fh_umibc)
            fh_tx = filehandle_fastq_gz(read2_fpath)
            opened.append(fh_tx)
        else:
            fh_umibc = open(read1_fpath, 'rt')
            opened.append(fh_umibc)
            fh_tx = open(read2_fpath, 'rt')
            opened.append(fh_tx)

        sample_counter = Counter()

        # Resolve all output paths first so that a barcode sequence listed
        # under several ids ends up with exactly one file.
        bc_fpath = dict()
        for bc_id, bc_seq in dict_bc_id2seq.items():
            bc_fpath[bc_seq] = join_path(
                outdir, 'BC-{}-{}.fastq'.format(bc_id, bc_seq))
        mkfolder(join_path(outdir, 'UNKNOWN'))
        bc_fpath['UNKNOWNBC_R1'] = join_path(outdir, 'UNKNOWN',
                                             'UNKNOWNBC_R1.fq')
        bc_fpath['UNKNOWNBC_R2'] = join_path(outdir, 'UNKNOWN',
                                             'UNKNOWNBC_R2.fq')

        out_fh_tagged_fq = None
        if tagging_only:
            out_fh_tagged_fq = open(join_path(outdir, tag_to), 'w')
            opened.append(out_fh_tagged_fq)

        bc_fhout = dict()
        for bc_seq, fpath in bc_fpath.items():
            if tagging_only and not bc_seq.startswith('UNKNOWN'):
                # All known barcodes share the single tagged output.
                bc_fhout[bc_seq] = out_fh_tagged_fq
            else:
                fh = open(fpath, 'w')
                opened.append(fh)
                bc_fhout[bc_seq] = fh

        i = 0
        while True:
            if verbose and i % 1000000 == 0:
                print_logger('Processing {:,} reads...'.format(i))
            try:
                umibc_name = next(fh_umibc).rstrip()
                umibc_seq = next(fh_umibc).rstrip()
                next(fh_umibc)  # '+' separator line
                umibc_qualstr = next(fh_umibc).rstrip()
                tx_name = next(fh_tx).rstrip()
                tx_seq = next(fh_tx).rstrip()
                next(fh_tx)  # '+' separator line
                tx_qualstr = next(fh_tx).rstrip()
                i += 1
            except StopIteration:
                # Either input is exhausted (possibly in the middle of a
                # record); the partial record is discarded.
                break

            sample_counter['total'] += 1

            if (len(umibc_seq) < umibc_len_min
                    or len(umibc_qualstr) < umibc_len_min):
                continue

            umibc_min_qual = min(
                ord(umibc_qualstr[pos]) - 33 for pos in umibc_idx)
            if umibc_min_qual < bc_qual_min:
                continue

            sample_counter['qualified'] += 1

            umi = umibc_seq[start_umi:(start_umi + len_umi)]
            cell_bc = umibc_seq[start_bc:(start_bc + len_bc)]
            try:
                fhout = bc_fhout[cell_bc]
            except KeyError:
                if save_unknown_bc_fastq:
                    bc_fhout['UNKNOWNBC_R1'].write(
                        '{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
                                                  "+", umibc_qualstr))
                    bc_fhout['UNKNOWNBC_R2'].write(
                        '{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq, "+",
                                                  tx_qualstr))
                sample_counter['unknown'] += 1
                continue

            # Reads shorter than len_tx are kept; longer ones are trimmed.
            if len(tx_seq) > len_tx:
                tx_seq, tx_qualstr = tx_seq[:len_tx], tx_qualstr[:len_tx]
            read_name = '@BC-{}_UMI-{}'.format(cell_bc, umi)
            fhout.write('{}\n{}\n{}\n{}\n'.format(read_name, tx_seq, "+",
                                                  tx_qualstr))
            sample_counter[cell_bc] += 1
            sample_counter['saved'] += 1

        sample_counter['unqualified'] = sample_counter['total'] - \
            sample_counter['qualified']

        return sample_counter
    finally:
        # Close outputs before inputs (reverse order of opening), also when
        # an error interrupts processing.
        for fh in reversed(opened):
            fh.close()
예제 #6
0
def _remove_gz_suffix(x):
    """
    Return the path ``x`` rebuilt with '.gz' stripped from its base name.

    The result is ``join_path(dir_name(x), <base name without '.gz'>)``.
    """
    # NOTE(review): str.replace drops every '.gz' occurrence in the base
    # name, not only a trailing one -- confirm this is intended.
    stripped = base_name(x).replace('.gz', '')
    return join_path(dir_name(x), stripped)