def main():
    """
    Command-line entry point for generating the QC report.

    Parses the expression-file path, the output html path, an optional
    plot name and field separator, then renders the report with
    plotly_qc_st (when --st is given) or plotly_qc (otherwise).
    """
    cli = argparse.ArgumentParser(add_help=True)
    cli.add_argument(
        'fpath',
        type=str,
        metavar='FILENAME',
        help=('file path (CSV/TSV) to the expression file with genes/features '
              'as rows and cells/samples on columns. '
              'First column saves gene names.'),
    )
    cli.add_argument(
        'saveto',
        type=str,
        metavar='FILENAME',
        help='File path (html) to save the QC plots.',
    )
    cli.add_argument('--name', type=str, metavar='STR', default='')
    cli.add_argument(
        '--sep',
        type=str,
        default='\t',
        help='File sep (default: \'\t\')',
    )
    cli.add_argument('--st', dest='is_st', action='store_true')
    cli.set_defaults(is_st=False)
    opts = cli.parse_args()

    # Both renderers take the same positional arguments, so choose one
    # and call it once.
    render_qc = plotly_qc_st if opts.is_st else plotly_qc
    render_qc(opts.fpath, opts.saveto, opts.sep, opts.name)

    print_logger('Generate QC for {}'.format(opts.fpath))
    print_logger('See {}'.format(opts.saveto))
def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
                    gene_types=(), stranded=True, dumpto=None, verbose=False):
    '''
    Prepare a feature model from a GFF annotation file.

    Input:
        gff_fpath: path handed to HTSeq.GFF_Reader.
        feature_atrr: GFF attribute whose (stripped) value names a feature.
        feature_type: only GFF records of this type are used.
        gene_types: values of the 'gene_biotype' attribute to export; an
            empty value exports everything. Only applied when feature_atrr
            starts with 'gene'.
        stranded: forwarded to HTSeq.GenomicArrayOfSets.
        dumpto: if given, the returned pair is pickled to this path.
        verbose: log progress every 100,000 GFF lines.

    Output: (features, exported_genes) where:
        - features: HTSeq.GenomicArrayOfSets()
        - exported_genes: a sorted tuple (empty when nothing is exported)

    For example, feature_atrr = 'gene_name', feature_type = 'exon',
    gene_types = ('protein_coding', 'lincRNA'):
        - features: all exons ~ all gnames mapping and ready for counting
        - exported_genes: only protein_coding and lincRNA gnames are visible

    Quantification used the full genes but only the selected genes are
    reported.
    '''
    features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    exported_genes = set()

    # The biotype filter only makes sense for gene-level attributes and
    # only when the caller asked for specific biotypes.
    filter_by_biotype = feature_atrr.startswith('gene') and bool(gene_types)

    n_lines = 0
    for gff in HTSeq.GFF_Reader(gff_fpath):
        if verbose and n_lines % 100000 == 0:
            print_logger('Processing {:,} lines of GFF...'.format(n_lines))
        n_lines += 1

        if gff.type != feature_type:
            continue

        feature_name = gff.attr[feature_atrr].strip()
        # Every matching record takes part in counting ...
        features[gff.iv] += feature_name
        # ... but only the selected ones are reported.
        if (not filter_by_biotype
                or gff.attr.get('gene_biotype', None) in gene_types):
            exported_genes.add(feature_name)

    print_logger('Processed {:,} lines of GFF...'.format(n_lines))

    # Always hand back the same type: previously an empty result was left
    # as a set while a non-empty one became a sorted tuple.
    exported_genes = tuple(sorted(exported_genes))

    if dumpto:
        with open(dumpto, 'wb') as fh:
            pickle.dump((features, exported_genes), fh)
    return (features, exported_genes)
def run_bam2sam(x):
    """
    Convert the BAM file `x` to SAM with `samtools view -h`.

    The SAM path is dir_name(x) joined with base_name(x) + '.sam'.
    The input file is removed only when the SAM file is reported as
    non-empty; otherwise a failure is logged and the input is kept.
    """
    sam_fpath = join_path(dir_name(x), base_name(x) + '.sam')
    popen_communicate('samtools view -h {} -o {}'.format(x, sam_fpath))

    if not is_nonempty_file(sam_fpath):
        print_logger('Failed: bam to sam {} to {}'.format(x, sam_fpath))
        return

    print_logger('Finished: bam to sam {} to {}'.format(x, sam_fpath))
    rmfile(x)
def main():
    """
    Command-line entry point for demultiplexing a SAM/BAM file.

    Without --claim the file is passed to demultiplex_sam. With --claim
    the barcode sequences of the used barcode IDs are looked up in the
    barcode dictionary and passed as `claimed_bc` to
    demultiplex_sam_with_claim.
    """
    cli = argparse.ArgumentParser(add_help=True)
    cli.add_argument(
        '--sbam', type=str, metavar='FILENAME',
        help='File path to SAM/BAM file',
    )
    cli.add_argument(
        '--savetodir', type=str, metavar='DIRNAME',
        help='Directory path to save the demultiplexed SAMs.',
        default='.',
    )
    cli.add_argument(
        '--bc-length', type=int, metavar='N',
        help='Length of cell barcode.',
        default=6,
    )
    cli.add_argument('--claim', action='store_true', dest='claim')
    cli.set_defaults(claim=False)
    cli.add_argument(
        '--bc-index', type=str, metavar='FILENAME',
        help='File path to barcode dictionary.',
    )
    cli.add_argument(
        '--bc-seq-column', type=int, metavar='N', default=0,
        help=('Column of cell barcode dictionary file '
              'which tells the actual sequences.'),
    )
    cli.add_argument(
        '--bc-index-used', type=str, metavar='string', default='1-96',
        help='Index of used barcode IDs (default=1-96)',
    )
    opts = cli.parse_args()

    print_logger('Demultiplexing SAM/BAM starts {} ...'.format(opts.sbam))

    if not opts.claim:
        demultiplex_sam(samfile=opts.sbam,
                        outdir=opts.savetodir,
                        bc_length=opts.bc_length)
    else:
        id2seq = bc_dict_id2seq(opts.bc_index, opts.bc_seq_column)
        used_ids = str2int(opts.bc_index_used)
        # IDs absent from the dictionary yield None entries.
        claimed_seqs = [id2seq.get(bc_id, None) for bc_id in used_ids]
        demultiplex_sam_with_claim(samfile=opts.sbam,
                                   outdir=opts.savetodir,
                                   bc_length=opts.bc_length,
                                   claimed_bc=claimed_seqs)

    print_logger('Demultiplexing SAM/BAM ends. See: {}'.format(opts.savetodir))
def main():
    """
    Command-line entry point for slimming a project directory.

    Collects '*.fastq' files (and '*.fq' files in 'UNKNOWN' folders)
    under <project_dir>/small_fq and '*.sam' files under
    <project_dir>/small_sam, then submits run_gzip_fastq / run_sam2bam
    for them to a process pool of `args.cores` workers. With the dry-run
    flag only the number of files that would be processed is logged.
    Directory sizes before and after are logged. Always returns 0.
    """
    opts = get_argument_parser().parse_args()

    SUBDIR_FASTQ = 'small_fq'
    SUBDIR_ALIGN = 'small_sam'

    fastqs = glob.glob(
        join_path(opts.project_dir, SUBDIR_FASTQ, '*', '*.fastq'),
        recursive=True)
    fastqs_unknown = glob.glob(
        join_path(opts.project_dir, SUBDIR_FASTQ, '*', 'UNKNOWN', '*.fq'),
        recursive=True)
    sams = glob.glob(
        join_path(opts.project_dir, SUBDIR_ALIGN, '*', '*.sam'),
        recursive=True)
    all_fastqs = fastqs + fastqs_unknown

    if opts.dryrun:
        print_logger(('{} fastqs are '
                      'to be gzipped. ').format(len(all_fastqs)))
        print_logger('{} sams are to be converted to bam'.format(len(sams)))
        return 0

    fastq_size_before = dirsize_str(join_path(opts.project_dir, SUBDIR_FASTQ))
    align_size_before = dirsize_str(join_path(opts.project_dir, SUBDIR_ALIGN))

    # Fire-and-forget: results of the asynchronous jobs are not collected.
    workers = Pool(opts.cores)
    for fastq in all_fastqs:
        workers.apply_async(run_gzip_fastq, args=(fastq,))
    for sam in sams:
        workers.apply_async(run_sam2bam, args=(sam,))
    workers.close()
    workers.join()

    fastq_size_after = dirsize_str(join_path(opts.project_dir, SUBDIR_FASTQ))
    align_size_after = dirsize_str(join_path(opts.project_dir, SUBDIR_ALIGN))

    print_logger('Storage of FASTQs: {} => {}'.format(fastq_size_before,
                                                      fastq_size_after))
    print_logger('Storage of Alignments: {} => {}'.format(align_size_before,
                                                          align_size_after))
    return 0
def demultiplexing(read1_fpath, read2_fpath, dict_bc_id2seq, outdir,
                   start_umi=0, start_bc=6, len_umi=6, len_bc=6, len_tx=35,
                   bc_qual_min=10, is_gzip=True,
                   save_unknown_bc_fastq=False,
                   tagging_only=False, tag_to='tagged.fastq',
                   do_bc_rev_complement=False, do_tx_rev_complement=False,
                   verbose=False):
    """
    Demultiplexing to fastq files based on barcode sequence.

    R1 (UMI + cell barcode) and R2 (transcript) are read in lockstep, four
    lines per record. A pair is dropped when R1 is too short to contain
    every UMI/barcode position, or when the lowest quality (Phred+33)
    over those positions is below `bc_qual_min`. Remaining pairs are
    routed by their barcode sequence; R2 is truncated to `len_tx` bases
    and written under the read name '@BC-<barcode>_UMI-<umi>'.

    Args:
        read1_fpath: FASTQ with UMI and cell barcode.
        read2_fpath: FASTQ with the transcript reads.
        dict_bc_id2seq: mapping of barcode ID to barcode sequence.
        outdir: output directory. One 'BC-<id>-<seq>.fastq' per barcode
            is written there, plus 'UNKNOWN/UNKNOWNBC_R1.fq' and
            'UNKNOWN/UNKNOWNBC_R2.fq'.
        start_umi, len_umi: 0-based start and length of the UMI on R1.
        start_bc, len_bc: 0-based start and length of the barcode on R1.
        len_tx: maximum length kept from R2.
        bc_qual_min: minimal quality required on every UMI/barcode base.
        is_gzip: open inputs with filehandle_fastq_gz instead of open().
        save_unknown_bc_fastq: also write pairs with an unknown barcode
            to the UNKNOWN files (the files are created either way).
        tagging_only: write all known-barcode reads to the single file
            `tag_to` under `outdir` instead of one file per barcode.
        tag_to: base name of that single file.
        do_bc_rev_complement, do_tx_rev_complement: accepted but unused.
        verbose: log progress every 1,000,000 reads.

    Returns:
        collections.Counter with keys 'total', 'qualified', 'unqualified',
        'unknown', 'saved' and one entry per observed known barcode.

    NOTE: inputs are assumed to be well-formed 4-line FASTQ with R1 and R2
    in the same order; pairing and record integrity are not validated.
    """
    # Positions on R1 covered by the UMI or the barcode. They are the same
    # for every read, so compute them once instead of per read.
    umibc_idx = sorted(set(range(start_umi, start_umi + len_umi))
                       | set(range(start_bc, start_bc + len_bc)))
    # R1 must reach the highest position. Comparing against the *number*
    # of positions (as before) let short reads through whenever the
    # UMI/barcode layout does not start at 0 or has a gap, which then
    # failed with IndexError.
    min_r1_len = umibc_idx[-1] + 1 if umibc_idx else 0

    sample_counter = Counter()
    opened = []  # every handle opened here, closed in `finally`

    def _track(fh):
        opened.append(fh)
        return fh

    try:
        if is_gzip:
            fh_umibc = _track(filehandle_fastq_gz(read1_fpath))
            fh_tx = _track(filehandle_fastq_gz(read2_fpath))
        else:
            fh_umibc = _track(open(read1_fpath, 'rt'))
            fh_tx = _track(open(read2_fpath, 'rt'))

        bc_fpath = dict()
        for bc_id, bc_seq in dict_bc_id2seq.items():
            bc_fpath[bc_seq] = join_path(
                outdir, 'BC-{}-{}.fastq'.format(bc_id, bc_seq))
        mkfolder(join_path(outdir, 'UNKNOWN'))
        bc_fpath['UNKNOWNBC_R1'] = join_path(outdir, 'UNKNOWN',
                                             'UNKNOWNBC_R1.fq')
        bc_fpath['UNKNOWNBC_R2'] = join_path(outdir, 'UNKNOWN',
                                             'UNKNOWNBC_R2.fq')

        if tagging_only:
            out_fh_tagged_fq = _track(open(join_path(outdir, tag_to), 'w'))

        bc_fhout = dict()
        for bc_seq, fpath in bc_fpath.items():
            if bc_seq.startswith('UNKNOWN'):
                bc_fhout[bc_seq] = _track(open(fpath, 'w'))
                continue
            if tagging_only:
                # All known barcodes share the single tagged file.
                bc_fhout[bc_seq] = out_fh_tagged_fq
            else:
                bc_fhout[bc_seq] = _track(open(fpath, 'w'))

        n_reads = 0
        while True:
            if verbose and n_reads % 1000000 == 0:
                print_logger('Processing {:,} reads...'.format(n_reads))
            try:
                umibc_name = next(fh_umibc).rstrip()
                umibc_seq = next(fh_umibc).rstrip()
                next(fh_umibc)  # '+' separator line
                umibc_qualstr = next(fh_umibc).rstrip()
                tx_name = next(fh_tx).rstrip()
                tx_seq = next(fh_tx).rstrip()
                next(fh_tx)  # '+' separator line
                tx_qualstr = next(fh_tx).rstrip()
                n_reads += 1
            except StopIteration:
                break

            sample_counter['total'] += 1

            if len(umibc_seq) < min_r1_len:
                continue
            umibc_min_qual = min(ord(umibc_qualstr[pos]) - 33
                                 for pos in umibc_idx)
            if umibc_min_qual < bc_qual_min:
                continue
            sample_counter['qualified'] += 1

            umi = umibc_seq[start_umi:(start_umi + len_umi)]
            cell_bc = umibc_seq[start_bc:(start_bc + len_bc)]

            try:
                fhout = bc_fhout[cell_bc]
            except KeyError:
                if save_unknown_bc_fastq:
                    bc_fhout['UNKNOWNBC_R1'].write(
                        '{}\n{}\n{}\n{}\n'.format(umibc_name, umibc_seq,
                                                  "+", umibc_qualstr))
                    bc_fhout['UNKNOWNBC_R2'].write(
                        '{}\n{}\n{}\n{}\n'.format(tx_name, tx_seq,
                                                  "+", tx_qualstr))
                sample_counter['unknown'] += 1
                continue

            if len(tx_seq) > len_tx:
                tx_seq, tx_qualstr = tx_seq[:len_tx], tx_qualstr[:len_tx]
            read_name = '@BC-{}_UMI-{}'.format(cell_bc, umi)
            fhout.write('{}\n{}\n{}\n{}\n'.format(read_name, tx_seq,
                                                  "+", tx_qualstr))
            sample_counter[cell_bc] += 1
            sample_counter['saved'] += 1

        sample_counter['unqualified'] = (sample_counter['total']
                                         - sample_counter['qualified'])
    finally:
        # Release every handle even when reading or writing failed midway.
        for fh in opened:
            fh.close()

    return sample_counter
def main():
    """
    Command-line entry point for demultiplexing paired FASTQ files.

    Builds the mapping of used barcode IDs to barcode sequences from the
    barcode dictionary file, runs `demultiplexing` on the two read files
    and writes the resulting statistics with `write_demultiplexing`.

    Exits through `parser.error` when a used barcode ID has no sequence
    in the dictionary: such an ID would map to None, which cannot be used
    as a barcode sequence by `demultiplexing`.
    """
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('read1_fpath', type=str)
    parser.add_argument('read2_fpath', type=str)
    parser.add_argument('--bc-index', type=str, metavar='FILENAME',
                        help='File path to barcode dictionary')
    parser.add_argument('--bc-seq-column', type=int, metavar='N', default=0,
                        help=('Column of cell barcode dictionary file '
                              'which tells the actual sequences.'))
    parser.add_argument('--bc-index-used', type=str, metavar='string',
                        default='1-96',
                        help='Index of used barcode IDs (default=1-96)')
    parser.add_argument('--min-bc-quality', metavar='N', type=int, default=10,
                        help='Minimal quality for barcode reads (default=10)')
    parser.add_argument('--out-dir', metavar='DIRNAME', type=str, default='.',
                        help='Output directory. Defaults to current directory')
    parser.add_argument('--is-gzip', dest='is_gzip', action='store_true')
    parser.add_argument('--not-gzip', dest='is_gzip', action='store_false')
    parser.set_defaults(is_gzip=True)
    parser.add_argument('--stats-file', metavar='STATFILE', type=str,
                        default='demultiplexing.log',
                        help='Statistics (default: demultiplexing.log)')
    parser.add_argument('--umi-start-position', metavar='N', type=int,
                        default=0,
                        help=('Start index of UMI on R1. '
                              'Default: 0. (0-based).'))
    parser.add_argument('--umi-length', metavar='N', type=int, default=6,
                        help='Length of UMI (default=6)')
    parser.add_argument('--bc-start-position', metavar='N', type=int,
                        default=6,
                        help=('Start index of cell barcode on R1. '
                              'Default: 6. (0-based).'))
    parser.add_argument('--bc-length', metavar='N', type=int, default=6,
                        help='Length of CELSeq barcode (default=6)')
    parser.add_argument('--cut-length', metavar='N', type=int, default=35,
                        help='Length of read on R2 to be mapped. (default=35)')
    parser.add_argument('--save-unknown-bc-fastq',
                        dest='save_unknown_bc_fastq', action='store_true')
    parser.set_defaults(save_unknown_bc_fastq=False)
    parser.add_argument('--tagging-only', dest='tagging_only',
                        action='store_true',
                        help=('Demultiplexed reads are merged to a file named'
                              ' \"tagged.fastq\" under --out-dir.'))
    parser.set_defaults(tagging_only=False)
    parser.add_argument('--tag-to', dest='tag_to', default='tagged.fastq',
                        help=('File base name to save the tagged fastq file. '
                              'Only used when tagging_only.'))
    parser.add_argument('--verbose', dest='verbose', action='store_true')
    parser.set_defaults(verbose=False)

    args = parser.parse_args()

    bc_dict = bc_dict_id2seq(args.bc_index, args.bc_seq_column)
    bc_index_used = str2int(args.bc_index_used)
    bc_dict = {x: bc_dict.get(x, None) for x in bc_index_used}

    # Fail early and clearly instead of passing None barcode sequences on,
    # e.g. when the default range 1-96 exceeds the dictionary.
    missing_ids = [bc_id for bc_id, bc_seq in bc_dict.items()
                   if bc_seq is None]
    if missing_ids:
        parser.error(
            'barcode ID(s) {} requested by --bc-index-used are not defined '
            'in the barcode dictionary {}'.format(
                ', '.join(map(str, missing_ids)), args.bc_index))

    print_logger('Demultiplexing starts {}--{} ...'.format(
        args.read1_fpath, args.read2_fpath))
    out = demultiplexing(read1_fpath=args.read1_fpath,
                         read2_fpath=args.read2_fpath,
                         outdir=args.out_dir,
                         dict_bc_id2seq=bc_dict,
                         start_umi=args.umi_start_position,
                         start_bc=args.bc_start_position,
                         len_umi=args.umi_length,
                         len_bc=args.bc_length,
                         len_tx=args.cut_length,
                         bc_qual_min=args.min_bc_quality,
                         is_gzip=args.is_gzip,
                         save_unknown_bc_fastq=args.save_unknown_bc_fastq,
                         tagging_only=args.tagging_only,
                         tag_to=args.tag_to,
                         do_bc_rev_complement=False,
                         do_tx_rev_complement=False,
                         verbose=args.verbose)
    print_logger('Demultiplexing ends {}--{}.'.format(args.read1_fpath,
                                                      args.read2_fpath))
    write_demultiplexing(out, bc_dict, args.stats_file)
def plotly_qc(fpath, saveto, sep=',', name=''):
    '''
    Generate a plotly html report for QC of a scRNA-seq data.

    QC includes, per cell:
        - number of total UMIs
        - number of detected genes
        - fraction of MT expression (genes prefixed 'mt-' or 'MT-')

    Five panels are produced: genes vs UMIs, MT fraction vs UMIs, and a
    histogram of each of the three metrics.

    Input:
        fpath: file path (CSV/TSV) to the expression file with
            genes/features as rows and cells/samples on columns.
            First column saves gene names.
        saveto: a html file to save the plots using Plot.ly
        sep: file sep. Default: ","
        name: title of the report; falls back to base_name(fpath).

    Output: True when the report was written, False when `fpath` is not
    a non-empty file.
    '''
    if not is_nonempty_file(fpath):
        return False
    name = name or base_name(fpath)

    expr = pd.read_csv(fpath, index_col=0, sep=sep)
    n_genes, n_cells = expr.shape[0], expr.shape[1]
    print_logger(('UMI count matrix: '
                  '{} genes x {} cells').format(n_genes, n_cells))

    umis_per_cell = expr.sum(axis=0)
    genes_per_cell = (expr > 0).sum(axis=0)

    mt_genes = [gene for gene in expr.index
                if gene.startswith(('mt-', 'MT-'))]
    if mt_genes:
        mt_umis = expr.loc[pd.Series(mt_genes), :].sum(axis=0)
        percent_mt = (mt_umis / umis_per_cell).replace(np.inf, 0)
    else:
        percent_mt = 0

    qc = pd.DataFrame(dict(total_num_UMIs=umis_per_cell,
                           num_detected_genes=genes_per_cell,
                           percent_mt=percent_mt))

    # Axis labels are shared between the scatter plots and histograms.
    umi_label = '#Total UMIs (median={})'.format(qc.total_num_UMIs.median())
    gene_label = '#Detected Genes (median={})'.format(
        qc.num_detected_genes.median())
    mt_label = 'MT Fraction (median={:6.4f})'.format(qc.percent_mt.median())

    # 1/5: detected genes against total UMIs
    scatter_genes = plotly_scatter(
        x=qc.total_num_UMIs,
        y=qc.num_detected_genes,
        xlab=umi_label,
        ylab=gene_label,
        main=name,
        hover_text=qc.index.values)
    scatter_genes.layout.yaxis.scaleanchor = None

    # 2/5: MT fraction against total UMIs
    scatter_mt = plotly_scatter(
        x=qc.total_num_UMIs,
        y=qc.percent_mt,
        xlab=umi_label,
        ylab=mt_label,
        main=name,
        hover_text=qc.index.values)
    scatter_mt.layout.yaxis.scaleanchor = None

    # 3/5, 4/5, 5/5: one histogram per metric
    hist_umis = plotly_hist(vals=qc.total_num_UMIs, xlab=umi_label)
    hist_genes = plotly_hist(vals=qc.num_detected_genes, xlab=gene_label)
    hist_mt = plotly_hist(vals=qc.percent_mt, xlab=mt_label)

    # Merge the 5 figures together: (figure, row, col); the list position
    # (1-based) is also the axis number of the panel in the merged figure.
    panels = [
        (scatter_genes, 1, 1),
        (scatter_mt, 1, 2),
        (hist_umis, 2, 1),
        (hist_genes, 2, 2),
        (hist_mt, 2, 3),
    ]
    qc_fig = tools.make_subplots(
        rows=2, cols=3,
        specs=[[{}, {}, None],
               [{}, {}, {}]])
    for panel, row, col in panels:
        qc_fig.append_trace(panel.data[0], row, col)

    # Carry each panel's own axis settings over to the merged figure.
    for axis_no, (panel, _, _) in enumerate(panels, start=1):
        for axis in ('xaxis', 'yaxis'):
            merged_axis = '{}{}'.format(axis, axis_no)
            setattr(qc_fig.layout, merged_axis,
                    {**getattr(qc_fig.layout, merged_axis),
                     **getattr(panel.layout, axis)})

    qc_fig['layout'].update(height=800, width=1000, title=name,
                            showlegend=False)
    plot(qc_fig, filename=saveto, auto_open=False)
    return True
def plotly_qc_st(fpath, saveto, sep='\t', name=''):
    '''
    Generate a plotly html report for QC of a spatial (ST) UMI-count matrix.

    Each spot is drawn at its (Row, Col) position and coloured by one of:
        - number of total UMIs
        - number of detected genes
        - fraction of MT expression (genes prefixed 'mt-' or 'MT-')

    Input:
        fpath: file path to the matrix with spots as rows and genes as
            columns. The first column holds the spot labels, which are
            split on 'x' into two integers used as Row and Col.
        saveto: a html file to save the plots using Plot.ly
        sep: file sep. Default: tab
        name: title of the report; falls back to base_name(fpath).

    Output: True when the report was written, False when `fpath` is not
    a non-empty file.
    '''
    bool_success = False
    if not is_nonempty_file(fpath):
        return bool_success
    if not name:
        name = base_name(fpath)

    ST = pd.read_csv(fpath, sep=sep, index_col=0)
    print_logger(('ST UMI-count matrix has '
                  '{} spots x {} genes').format(ST.shape[0], ST.shape[1]))

    ST_total_UMIs = ST.sum(axis=1)
    ST_detected_genes = (ST > 0).sum(axis=1)
    mt_cols = [x for x in ST.columns
               if x.startswith('mt-') or x.startswith('MT-')]
    if not mt_cols:
        ST_percent_mt = 0
    else:
        # Non-finite fractions (division by a zero total) are left untouched.
        ST_percent_mt = ST[mt_cols].sum(axis=1) / ST_total_UMIs

    # Spot labels look like '<row>x<col>'. `np.int` was removed from NumPy
    # (1.24); the builtin `int` is the exact same dtype.
    st_xy = np.array([spot.strip().split('x') for spot in ST.index])
    st_x = st_xy[:, 0].astype(int)
    st_y = st_xy[:, 1].astype(int)

    ST_qc = pd.DataFrame(
        dict(Row=st_x, Col=st_y,
             total_num_UMIs=ST_total_UMIs,
             num_detected_genes=ST_detected_genes,
             percent_mt=ST_percent_mt))

    # 1/3
    plotly_ST_g = plotly_scatter(
        x=ST_qc.Row, y=ST_qc.Col,
        mask_by=ST_qc.num_detected_genes,
        hover_text=ST_qc.num_detected_genes.astype('str'),
        colorscale='Viridis',
        mask_title=('#Detected Genes '
                    '(median={})').format(ST_qc.num_detected_genes.median()))
    # 2/3
    plotly_ST_UMIs = plotly_scatter(
        x=ST_qc.Row, y=ST_qc.Col,
        mask_by=ST_qc.total_num_UMIs,
        hover_text=ST_qc.total_num_UMIs.astype('str'),
        colorscale='Viridis',
        mask_title=('#Total UMIs '
                    '(median={})').format(ST_qc.total_num_UMIs.median()))
    # 3/3
    plotly_ST_mt = plotly_scatter(
        x=ST_qc.Row, y=ST_qc.Col,
        mask_by=ST_qc.percent_mt,
        hover_text=ST_qc.percent_mt.astype('str'),
        colorscale='Viridis',
        mask_title=('MT Fraction '
                    '(median={:6.4f})').format(ST_qc.percent_mt.median()))

    # Merge the 3 figures together
    fig = tools.make_subplots(
        rows=1, cols=3,
        subplot_titles=('#Total UMIs', '#Detected Genes', 'MT Fraction'))
    fig.append_trace(plotly_ST_UMIs.data[0], 1, 1)
    fig.append_trace(plotly_ST_g.data[0], 1, 2)
    fig.append_trace(plotly_ST_mt.data[0], 1, 3)
    fig['layout'].update(height=600, width=1900, title=name)
    fig.layout.showlegend = False

    # Manually change the locations of other two color bars to proper places
    fig.data[0].marker.colorbar.x = 0.28
    fig.data[1].marker.colorbar.x = 0.64

    plot(fig, filename=saveto, auto_open=False)
    bool_success = True
    return bool_success
def run_gunzip_fastq(x):
    """
    Run `gzip -f -d` on the file `x` and log the command as finished.

    The log line is written after the command call regardless of its
    result, which is not inspected here.
    """
    command = 'gzip -f -d {}'.format(x)
    popen_communicate(command)
    print_logger('Finished: ' + command)