def count_reads_in_features(
        sam_filename,
        gff_filename,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout,
        samout_format,
        output_delimiter,
        output_filename,
        cb_tag,
        ub_tag,
        ):
    '''Count reads in features, parallelizing by file

    Loads the features from the GFF file, counts reads (grouped by cell
    barcode) via count_reads_with_barcodes, and writes a table with one
    column per cell barcode: a header row, one row per feature, and one
    row per special counter (__no_feature, __ambiguous, ...).

    Args:
        sam_filename: input SAM/BAM file, or '-' for stdin.
        gff_filename: GFF/GTF file with the features to count against.
        samout: optional filename for annotated alignment output; None
            disables it.
        output_filename: table destination; '' prints to stdout instead.
        (remaining arguments are forwarded unchanged to
        count_reads_with_barcodes)
    '''
    if samout is not None:
        # Fail early if the optional alignment-output file cannot be created
        if samout_format in ('SAM', 'sam'):
            with open(samout, 'w'):
                pass
        else:
            # Binary output needs a template header; stdin provides none,
            # so the check is skipped for '-'
            if sam_filename != '-':
                with pysam.AlignmentFile(sam_filename, 'r') as sf:
                    with pysam.AlignmentFile(samout, 'w', template=sf):
                        pass

    # Fail early if the input alignment file is missing or unreadable
    if sam_filename != '-':
        with pysam.AlignmentFile(sam_filename, 'r') as sf:
            pass

    # Build the GenomicArrayOfSets used to look up feature overlaps
    gff = HTSeq.GFF_Reader(gff_filename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        id_attribute,
        feature_type=feature_type,
        additional_attributes=additional_attributes,
        stranded=stranded != 'no',
        verbose=not quiet,
        )
    features = feature_scan['features']
    attributes = feature_scan['attributes']
    feature_attr = sorted(attributes.keys())

    if len(feature_attr) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    # Count reads
    results = count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout,
        cb_tag,
        ub_tag,
        )

    # Cell barcodes
    cbs = results['cell_barcodes']
    counts = results['counts']

    # Assemble the whole output table in memory: header row, one row per
    # feature, then the special counters
    other_features = [
        '__no_feature',
        '__ambiguous',
        '__too_low_aQual',
        '__not_aligned',
        '__alignment_not_unique',
        ]
    pad = ['' for attr in additional_attributes]

    lines = []
    # Header: empty feature-id cell, attribute padding, one column per barcode
    lines.append(output_delimiter.join([''] + pad + cbs))
    for fn in feature_attr:
        lines.append(output_delimiter.join(
            [fn] + attributes[fn] + [str(counts[cb][fn]) for cb in cbs]))
    for fn in other_features:
        lines.append(output_delimiter.join(
            [fn] + pad + [str(counts[cb][fn]) for cb in cbs]))

    # Write the table once instead of reopening the output file for every
    # single row as the previous implementation did
    if output_filename == '':
        for line in lines:
            print(line)
    else:
        with open(output_filename, 'w') as f:
            for line in lines:
                f.write(line)
                f.write('\n')
def count_reads_in_features(args):
    """Count reads in features, parallelizing by file

    Args:
        args: ArgumentParser args, i.e. each argument is a property of this
         instance. Check the CLI parsing function below for a full list of
         properties, i.e. command-line options.

    This function can be conceptually split into the following steps:
     1. Load features from GTF file into memory
     2. Parse the reads from each BAM file in parallel or series
     3. Write output table

    Step 2 can be further split into two main components:
     1. Find what features overlap with each read/read pair
     2. Assign that read/pair to a feature (if unique) or a corner case
        (e.g. multimappers)
    """
    # Load feature GenomicArrayOfSets to mark overlaps
    gff = HTSeq.GFF_Reader(args.featuresfilename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        args.idattr,
        feature_type=args.feature_type,
        feature_query=args.feature_query,
        additional_attributes=args.additional_attributes,
        stranded=args.stranded != "no",
        verbose=not args.quiet,
        add_chromosome_info=args.add_chromosome_info,
    )
    features = feature_scan["features"]
    attributes = feature_scan["attributes"]
    feature_attr = sorted(attributes.keys())

    if len(feature_attr) == 0:
        # BUGFIX: the original literal lacked the f prefix, so the text
        # "{args.feature_type}" was printed verbatim instead of the value
        sys.stderr.write(
            f"Warning: No features of type '{args.feature_type}' found.\n")

    # Count reads in parallel or in series
    count_args, attributes = _prepare_args_for_counting(
        features,
        feature_attr,
        attributes,
        args.add_chromosome_info,
        args.additional_attributes,
        args.feature_query,
        args.feature_type,
        args.featuresfilename,
        args.idattr,
        args.max_buffer_size,
        args.minaqual,
        args.nonunique,
        args.order,
        args.mode,
        args.quiet,
        args.samfilenames,
        args.samout_format,
        args.samouts,
        args.secondary_alignments,
        args.stranded,
        args.supplementary_alignments,
    )
    if args.nprocesses > 1:
        with multiprocessing.Pool(args.nprocesses) as pool:
            results = pool.starmap(count_reads_single_file, count_args)
        # Pool results can arrive out of order; restore input-file order
        results.sort(key=operator.itemgetter("isam"))
    else:
        results = list(itertools.starmap(count_reads_single_file, count_args))

    # Merge and write output
    _write_output(
        results,
        args.samfilenames,
        attributes,
        args.additional_attributes,
        args.output_filename,
        args.output_delimiter,
        args.output_append,
        sparse=args.counts_output_sparse,
        dtype=np.float32,
    )
def count_reads_in_features(
        sam_filenames,
        gff_filename,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samouts,
        samout_format,
        output_delimiter,
        output_filename,
        output_append,
        nprocesses,
        feature_query,
        ):
    '''Count reads in features, parallelizing by file

    Loads the features from the GFF file once, then counts reads in each
    input SAM/BAM file (in parallel when nprocesses > 1) and writes a
    table with one column per input file: one row per feature followed by
    the special counters (__no_feature, __ambiguous, ...).

    Args:
        sam_filenames: list of input SAM/BAM files; a single '-' means stdin.
        gff_filename: GFF/GTF file with the features to count against.
        samouts: per-input annotated alignment outputs, or [] to disable.
        output_filename: table destination; '' prints to stdout instead.
        output_append: append to the output file instead of truncating it.
        nprocesses: requested worker processes (capped at len(sam_filenames)).
        (remaining arguments are forwarded unchanged to
        count_reads_single_file)
    '''
    # Never use more CPUs than files
    nprocesses = min(nprocesses, len(sam_filenames))

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of input and output files')
        # Fail early if any of the alignment-output files cannot be created
        if samout_format in ('SAM', 'sam'):
            for samout in samouts:
                with open(samout, 'w'):
                    pass
        else:
            # Binary output needs a template header; stdin provides none,
            # so the check is skipped for a single '-' input
            if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
                for sam_filename, samout in zip(sam_filenames, samouts):
                    with pysam.AlignmentFile(sam_filename, 'r') as sf:
                        with pysam.AlignmentFile(samout, 'w', template=sf):
                            pass
    else:
        samouts = [None for x in sam_filenames]

    # Fail early if any input alignment file is missing or unreadable
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with pysam.AlignmentFile(sam_filename, 'r') as sf:
                pass

    # Build the GenomicArrayOfSets used to look up feature overlaps
    gff = HTSeq.GFF_Reader(gff_filename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        id_attribute,
        feature_type=feature_type,
        feature_query=feature_query,
        additional_attributes=additional_attributes,
        stranded=stranded != 'no',
        verbose=not quiet,
        )
    features = feature_scan['features']
    attributes = feature_scan['attributes']
    feature_attr = sorted(attributes.keys())

    if len(feature_attr) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    # Prepare one argument tuple per input file for the counting worker
    args = []
    for isam, (sam_filename, samout_filename) in enumerate(
            zip(sam_filenames, samouts)):
        args.append((
            isam,
            sam_filename,
            features,
            feature_attr,
            order,
            max_buffer_size,
            stranded,
            overlap_mode,
            multimapped_mode,
            secondary_alignment_mode,
            supplementary_alignment_mode,
            feature_type,
            id_attribute,
            additional_attributes,
            quiet,
            minaqual,
            samout_format,
            samout_filename,
            ))

    # Count reads
    if nprocesses > 1:
        with multiprocessing.Pool(nprocesses) as pool:
            results = pool.starmap(count_reads_single_file, args)
        # Pool results can arrive out of order; restore input-file order
        results.sort(key=operator.itemgetter('isam'))
    else:
        results = list(itertools.starmap(count_reads_single_file, args))

    # Assemble the whole output table in memory: one row per feature,
    # then the special counters
    other_features = [
        ('__no_feature', 'empty'),
        ('__ambiguous', 'ambiguous'),
        ('__too_low_aQual', 'lowqual'),
        ('__not_aligned', 'notaligned'),
        ('__alignment_not_unique', 'nonunique'),
        ]
    pad = ['' for attr in additional_attributes]

    lines = []
    for fn in feature_attr:
        lines.append(output_delimiter.join(
            [fn] + attributes[fn] + [str(r['counts'][fn]) for r in results]))
    for title, fn in other_features:
        lines.append(output_delimiter.join(
            [title] + pad + [str(r[fn]) for r in results]))

    if output_filename == '':
        for line in lines:
            print(line)
    else:
        # BUGFIX: the previous implementation only truncated on the first
        # *feature* row, so with zero features the special counters were
        # appended to a stale pre-existing file even without output_append.
        # Opening once with the right mode fixes that and avoids reopening
        # the file for every single row.
        omode = 'a' if output_append else 'w'
        with open(output_filename, omode) as f:
            for line in lines:
                f.write(line)
                f.write('\n')