def main(args, outs):
    """ Send non-ascii text through martian's logging and progress calls.

    Logs one message as a unicode object, one as utf-8 encoded bytes and one
    dict, then posts a progress update built from args.values and args.sum.
    outs is not used.
    """
    # Same kind of message twice: once as unicode, once encoded to utf-8
    unicode_msg = u'Logging à non-ascii character from unicode string.'
    martian.log_info(unicode_msg)

    encoded_msg = u'Logging à non-ascii character from python string.'.encode('utf-8')
    martian.log_info(encoded_msg)

    # A non-string object is handed to the logger as well
    martian.log_info({'hello': 'world'})

    # Each value is rendered with a superscript-two suffix and joined by '+'
    squared_terms = [u'%f\u00b2' % value for value in args.values]
    progress_msg = u'%s = %f' % (u'+'.join(squared_terms), args.sum)
    martian.update_progress(progress_msg)
def prepare_transcriptome_indexes(reference_path, vdj_reference_path):
    """ Build the kmer index files used for chemistry detection.

    Makes sure an indexed genome FASTA is available, extracts a transcriptome
    FASTA from the genome FASTA and the genes GTF, and builds a kmer index
    over it. If vdj_reference_path is not None, a second kmer index is built
    from the V(D)J reference FASTA.

    Returns (kmer_idx_path, vdj_idx_path) where vdj_idx_path is None when no
    V(D)J reference was given.
    """
    ## Index the reference fasta
    genome_fa = os.path.join(reference_path, cr_constants.REFERENCE_FASTA_PATH)
    linked_fa = martian.make_path('ref.fa')

    if os.path.exists(genome_fa + '.fai'):
        # Look for existing .fai file (won't exist for our standard ref packages)
        martian.update_progress('Found genome FASTA index....')
        indexed_fa = genome_fa
    else:
        # Note: this will fail if user's fs doesn't support symlinks
        martian.update_progress('Symlinking genome FASTA...')
        os.symlink(genome_fa, linked_fa)

        # The .fai is written next to the symlink, not next to the reference
        martian.update_progress('Indexing genome...')
        run(['samtools', 'faidx', linked_fa])
        indexed_fa = linked_fa

    ## Generate a transcriptome reference from a genome ref
    martian.update_progress('Building transcriptome...')
    genes_gtf = os.path.join(reference_path,
                             cr_constants.REFERENCE_GENES_GTF_PATH)
    transcriptome_fa = martian.make_path('transcriptome.fa')

    # Only index the 1st encountered transcript per gene
    get_transcripts_cmd = ['detect_chemistry', 'get-transcripts']
    get_transcripts_cmd += [indexed_fa, genes_gtf, transcriptome_fa]
    run(get_transcripts_cmd)

    ## Build kmer index
    martian.update_progress('Building kmer index...')
    kmer_idx_path = martian.make_path('kmers.idx')
    run(['detect_chemistry', 'index-transcripts',
         transcriptome_fa, kmer_idx_path])

    # Build VDJ kmer index (optional)
    if vdj_reference_path is None:
        return (kmer_idx_path, None)

    vdj_fa = vdj_ref.get_vdj_reference_fasta(vdj_reference_path)
    vdj_idx_path = martian.make_path('vdj_kmers.idx')
    run(['detect_chemistry', 'index-transcripts', vdj_fa, vdj_idx_path])

    return (kmer_idx_path, vdj_idx_path)
def infer_sc3p_or_sc5p(chunks, kmer_idx_path, vdj_idx_path):
    """ Use ReadStates of R1/R2 to determine SC3Pv1 vs SC3Pv2 vs SC5P-R1 vs
    SC5P_auto/SCVDJ.

    Args:
        chunks: sequence of dicts, each with a 'read_type' key. The chunks of
            each read type are dumped to a JSON file that is handed to
            map_reads. The sequence is traversed once per read type.
        kmer_idx_path: kmer index built from the gene expression
            transcriptome.
        vdj_idx_path: kmer index built from the V(D)J reference, or None to
            skip the V(D)J mapping.

    Returns (chemistry_name, report, metrics) where chemistry_name is one of
    'SC3Pv1', 'SC3Pv2', 'SC5P-R1', 'SC5P_auto', 'SCVDJ' or None (undetermined),
    report is a text report and metrics is a dict keyed by read type.
    """
    ## Map read kmers to each strand
    martian.update_progress('Mapping reads...')

    # Prepare fastq paths
    fq_specs = {}
    for read_type in READ_TYPES:
        fq_specs[read_type] = martian.make_path('%s_in.json' % read_type)
        with open(fq_specs[read_type], 'w') as f:
            json.dump([c for c in chunks if c['read_type'] == read_type], f)

    # Map reads to gene expression reference
    metrics = {}
    for read_type in READ_TYPES:
        map_out_path = martian.make_path('%s_out.json' % read_type)
        metrics[read_type] = map_reads(fq_specs[read_type], kmer_idx_path,
                                       map_out_path)

    # Map reads to VDJ reference (optional); results are merged into the
    # per-read-type metrics under a 'vdj_' key prefix.
    if vdj_idx_path is not None:
        for read_type in READ_TYPES:
            map_out_path = martian.make_path('vdj_%s_out.json' % read_type)
            vdj_metrics = map_reads(fq_specs[read_type], vdj_idx_path,
                                    map_out_path)
            # .items() rather than .iteritems(): works on Python 2 and 3
            for k, v in vdj_metrics.items():
                metrics[read_type]['vdj_' + k] = v

    # Verify total read counts; a zero count means that read is not compared
    r1_total = metrics['R1']['total_reads']
    r2_total = metrics['R2']['total_reads']
    if r1_total != 0 and r2_total != 0 and r1_total != r2_total:
        # NOTE(review): presumably martian.exit aborts the stage - confirm
        martian.exit(
            'Total read counts for R1 and R2 must be identical if both are '
            'present. There were %d R1 reads and %d R2 reads. '
            'Check that all of the FASTQ files are present.'
            % (r1_total, r2_total))

    ## Infer chemistry
    report = '\n'
    for read_type, m in metrics.items():
        report += '%s Total Reads: %s\n' % (
            read_type, str(m['total_reads']).rjust(20))
        report += '%s Sense Reads: %s\n' % (
            read_type, str(m['sense_reads']).rjust(20))
        report += '%s Antisense Reads: %s\n' % (
            read_type, str(m['antisense_reads']).rjust(20))

    if vdj_idx_path is not None:
        for read_type, m in metrics.items():
            report += '%s Sense V(D)J Reads: %s\n' % (
                read_type, str(m['vdj_sense_reads']).rjust(20))
            report += '%s Antisense V(D)J Reads: %s\n' % (
                read_type, str(m['vdj_antisense_reads']).rjust(20))

    r1_state = getReadState(metrics['R1']['sense_reads'],
                            metrics['R1']['antisense_reads'],
                            metrics['R1']['mapped_reads'],
                            metrics['R1']['total_reads'])
    r2_state = getReadState(metrics['R2']['sense_reads'],
                            metrics['R2']['antisense_reads'],
                            metrics['R2']['mapped_reads'],
                            metrics['R2']['total_reads'])

    report += "\n"

    # The order of these checks matters: a sense-mapped R2 is reported as
    # SC3Pv2 regardless of the R1 state once the SC3Pv1 case is ruled out.
    chemistry_name = None
    if (r1_state == ReadState.SENSE_MAPPED) and (r2_state == ReadState.UNMAPPED):
        chemistry_name = 'SC3Pv1'
        report += "This library is likely to be a Single Cell 3' gene expression library (v1)."

    elif r2_state == ReadState.SENSE_MAPPED:
        chemistry_name = 'SC3Pv2'
        report += "This library is likely to be a Single Cell 3' gene expression library (v2)."

    elif (r1_state == ReadState.SENSE_MAPPED) and (r2_state == ReadState.ABSENT):
        chemistry_name = 'SC5P-R1'
        report += "This library is likely to be a Single Cell 5' gene expression library (R1)."

    elif r2_state == ReadState.ANTISENSE_MAPPED:
        if vdj_idx_path is None:
            # Without a V(D)J index the two library types cannot be separated
            report += "This library is likely to be a Single Cell V(D)J or Single Cell 5' gene expression library."
            chemistry_name = 'SC5P_auto'
        else:
            # Compare gene expression vs V(D)J counts on the strand each
            # read is checked against (R1 sense, R2 antisense)
            r1_gex_sense_count = metrics['R1'].get('sense_reads', 0)
            r2_gex_anti_count = metrics['R2'].get('antisense_reads', 0)
            r1_vdj_sense_count = metrics['R1'].get('vdj_sense_reads', 0)
            r2_vdj_anti_count = metrics['R2'].get('vdj_antisense_reads', 0)

            if (is_vdj(r1_gex_sense_count, r1_vdj_sense_count)
                    or is_vdj(r2_gex_anti_count, r2_vdj_anti_count)):
                report += "This library is likely to be a Single Cell V(D)J library."
                chemistry_name = 'SCVDJ'
            else:
                report += "This library is likely to be a Single Cell 5' gene expression library."
                chemistry_name = 'SC5P_auto'

    else:
        report += "There was not enough information to determine the nature of the library."
        chemistry_name = None

    return chemistry_name, report, metrics
def prepare_transcriptome_indexes(reference_path, vdj_reference_path):
    """ Build the kmer index files used for chemistry detection.

    Makes sure an indexed genome FASTA is available, extracts a transcriptome
    FASTA from the genome FASTA and the genes GTF, and builds a kmer index
    over it. The index is built with a '--skip' argument that grows with the
    size of the transcriptome FASTA. If vdj_reference_path is not None, a
    second kmer index is built from the V(D)J reference FASTA.

    Returns (kmer_idx_path, vdj_idx_path) where vdj_idx_path is None when no
    V(D)J reference was given.
    """
    ## Index the reference fasta
    fa_path = os.path.join(reference_path, cr_constants.REFERENCE_FASTA_PATH)
    new_fa_path = martian.make_path('ref.fa')

    if os.path.exists(fa_path + '.fai'):
        # Look for existing .fai file (won't exist for our standard ref packages)
        martian.update_progress('Found genome FASTA index....')
        new_fa_path = fa_path
    else:
        # Note: this will fail if user's fs doesn't support symlinks
        martian.update_progress('Symlinking genome FASTA...')
        os.symlink(fa_path, new_fa_path)

        # Index the symlink so the .fai is written next to it
        martian.update_progress('Indexing genome...')
        run(['samtools', 'faidx', new_fa_path])

    ## Generate a transcriptome reference from a genome ref
    martian.update_progress('Building transcriptome...')
    gtf_path = os.path.join(reference_path,
                            cr_constants.REFERENCE_GENES_GTF_PATH)
    out_fa_path = martian.make_path('transcriptome.fa')

    # Only index the 1st encountered transcript per gene
    run([
        'detect_chemistry', 'get-transcripts', new_fa_path, gtf_path,
        out_fa_path
    ])

    ## Build kmer index
    martian.update_progress('Building kmer index...')
    kmer_idx_path = martian.make_path('kmers.idx')

    ## Use a larger step size as the reference grows.
    ## This ensures the index size stays sane.
    ## Should get to a step of <10 for the whole genome, which
    ## is still 3x overlap w/ 32-mers
    fa_size = os.path.getsize(os.path.realpath(out_fa_path))
    # Floor division: with true division a fractional step (e.g. 1.5) would
    # make skip > 0 and emit a spurious '--skip=0'.
    step = fa_size // 400000000
    skip = step - 1

    index_args = ['detect_chemistry', 'index-transcripts']
    if skip > 0:
        index_args.append('--skip=%d' % skip)
    index_args.extend([out_fa_path, kmer_idx_path])
    run(index_args)

    # Build VDJ kmer index (optional)
    vdj_idx_path = None
    if vdj_reference_path is not None:
        vdj_fa_path = vdj_ref.get_vdj_reference_fasta(vdj_reference_path)
        vdj_idx_path = martian.make_path('vdj_kmers.idx')
        run([
            'detect_chemistry', 'index-transcripts', vdj_fa_path, vdj_idx_path
        ])

    return (kmer_idx_path, vdj_idx_path)
def main(args, outs):
    """ Post the sum-of-squares expression as a martian progress update.

    Does nothing when args.sum is None. Otherwise each entry of args.values
    is rendered as '<value>^2', the terms are joined with '+', and the
    result is reported together with args.sum. outs is not used.
    """
    # Guard clause with the idiomatic identity test (was: `not ... is None`)
    if args.sum is None:
        return

    terms = ['%f^2' % v for v in args.values]
    martian.update_progress('%s = %f' % ('+'.join(terms), args.sum))