def create_unaligned_bam(args, outs):
    """Write this chunk's FASTQ reads to a BAM file as unaligned records.

    A SAM header is assembled from the STAR reference's ``chrNameLength.txt``
    (one @SQ line per row) followed by one @RG line per entry of
    ``args.read_groups``. Every read from ``args.read_chunk`` -- and, when
    ``args.read2_chunk`` is set, the read at the same position in that file --
    is then written with flag 4 (unmapped, per the SAM spec) and an ``RG`` tag
    equal to ``args.read_group``.

    Args:
        args: Stage arguments. Reads ``reference_path``, ``read_groups``,
            ``read_group``, ``read_chunk`` and ``read2_chunk``.
        outs: Stage outputs. The BAM is written to ``outs.genome_output``.

    Side effects:
        Leaves a scratch header file named ``tmphdr`` in the current working
        directory.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()
    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as f:
        for line in f:
            chr_name, chr_len = line.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines; a literal backslash-t in the packed header is turned
    # into a real tab so the fields are tab-separated.
    for packed_rg in args.read_groups:
        header_buf.write(
            re.sub('\\\\t', '\t', tk_bam.make_rg_header(packed_rg)) + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object, so the header
    # goes through a scratch file that serves as the template for the output.
    with open('tmphdr', 'w') as f:
        f.write(header_buf.getvalue())

    samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)
    outbam = None
    fastq_file1 = None
    fastq_file2 = None
    try:
        outbam = pysam.AlignmentFile(outs.genome_output, 'wb', template=samfile)

        fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
        if args.read2_chunk:
            fastq_file2 = cr_io.open_maybe_gzip(args.read2_chunk)
        read1s = tk_fasta.read_generator_fastq(fastq_file1)
        read2s = tk_fasta.read_generator_fastq(fastq_file2) if fastq_file2 else []

        # A single segment object is reused for every record written.
        record = pysam.AlignedSegment()
        record.flag = 4

        def write_unaligned(read):
            # Only the part of the FASTQ name before the first space is kept.
            name, seq, qual = read
            record.query_name, record.query_sequence = name.split(' ')[0], seq
            # Qualities are assigned after the sequence, as in the original
            # statement order.
            record.query_qualities = tk_fasta.get_qvs(qual)
            record.set_tag('RG', read_group, 'Z')
            outbam.write(record)

        # Mates are interleaved: read1 first, then read2 when one is present.
        for read1, read2 in itertools.izip_longest(read1s, read2s):
            write_unaligned(read1)
            if read2:
                write_unaligned(read2)
    finally:
        # Release every handle even when reading or writing fails part-way,
        # so the BAM is flushed and no file descriptors leak.
        samfile.close()
        if fastq_file1 is not None:
            fastq_file1.close()
        if fastq_file2 is not None:
            fastq_file2.close()
        if outbam is not None:
            outbam.close()
def main(args, outs):
    """Run the STAR wrapper's ``align`` on this chunk of reads.

    Args:
        args: Stage arguments. Reads ``reference_path``, ``read_chunk``,
            ``read2_chunk``, ``max_hits_per_read``, ``threads`` and
            ``read_group``.
        outs: Stage outputs. ``outs.genome_output`` is passed to the aligner
            as its output argument.
    """
    star_index_dir = cr_utils.get_reference_star_path(args.reference_path)
    aligner = cr_reference.STAR(star_index_dir)

    # Read-group header in the form produced by tk_bam.make_star_rg_header.
    rg_tags = tk_bam.make_star_rg_header(args.read_group)

    aligner.align(
        args.read_chunk,
        args.read2_chunk,
        outs.genome_output,
        max_report_alignments_per_read=args.max_hits_per_read,
        threads=args.threads,
        read_group_tags=rg_tags,
    )
def main(args, outs):
    """Process one chunk of alignments while collecting metrics in a Reporter.

    Loads the STAR index, the gene index and the barcode whitelist and
    distribution, builds a ``cr_report.Reporter`` from them, runs
    ``process_alignments`` on the chunk inputs, and saves the reporter.

    Args:
        args: Stage arguments. Reads ``reference_path``, ``barcode_whitelist``,
            ``barcode_counts``, ``gem_group``, ``gem_groups``,
            ``chemistry_def``, ``umi_min_qual_threshold``,
            ``chunk_genome_input``, ``chunk_trimmed_input`` and
            ``bam_comments``; the whole object is also forwarded to
            ``process_alignments``.
        outs: Stage outputs. ``outs.output`` is handed to
            ``process_alignments``, whose return value is stored in
            ``outs.num_alignments``; the reporter is saved to
            ``outs.chunked_reporter``.
    """
    ref_path = args.reference_path

    star_dir = cr_utils.get_reference_star_path(ref_path)
    star_index = cr_transcriptome.build_star_index(star_dir)
    # The chromosome list is taken from the first element of the index's
    # first element.
    chroms = star_index[0][0]

    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    bc_dist = cr_utils.load_barcode_dist(
        args.barcode_counts, whitelist, args.gem_group)

    umi_length = cr_chem.get_umi_length(args.chemistry_def)
    reporter = cr_report.Reporter(
        reference_path=ref_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=whitelist,
        barcode_dist=bc_dist,
        gem_groups=args.gem_groups,
        umi_length=umi_length,
        umi_min_qual_threshold=args.umi_min_qual_threshold,
    )

    # The alignment pass is bracketed by the reporter's init/finalize hooks.
    reporter.attach_bcs_init()
    num_alignments = process_alignments(
        args.chunk_genome_input,
        args.chunk_trimmed_input,
        outs.output,
        args.bam_comments,
        reporter,
        gene_index,
        star_index,
        args,
    )
    outs.num_alignments = num_alignments
    reporter.attach_bcs_finalize()

    reporter.save(outs.chunked_reporter)