Code example #1
0
def count_reads_in_features(
    sam_filename,
    gff_filename,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout,
    samout_format,
    output_delimiter,
    output_filename,
    cb_tag,
    ub_tag,
):
    '''Count reads in features, parallelizing by file.

    Parameters mirror the command-line options of the barcoded counter;
    see the CLI parsing function for their meaning.  The resulting count
    table has one column per cell barcode and one row per feature (plus
    special rows for unassigned reads).  It is printed to stdout when
    ``output_filename`` is the empty string, otherwise written to that
    file.
    '''

    if samout is not None:
        # Try to open the samout file early in case it has issues
        if samout_format in ('SAM', 'sam'):
            with open(samout, 'w'):
                pass
        else:
            # We don't have a template if the input is stdin
            if sam_filename != '-':
                with pysam.AlignmentFile(sam_filename, 'r') as sf:
                    with pysam.AlignmentFile(samout, 'w', template=sf):
                        pass

    # Try to open the samfile to fail early in case it is not there
    if sam_filename != '-':
        with pysam.AlignmentFile(sam_filename, 'r') as sf:
            pass

    # Prepare features: scan the GFF once into a GenomicArrayOfSets
    gff = HTSeq.GFF_Reader(gff_filename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        id_attribute,
        feature_type=feature_type,
        additional_attributes=additional_attributes,
        stranded=stranded != 'no',
        verbose=not quiet,
    )
    features = feature_scan['features']
    attributes = feature_scan['attributes']
    feature_attr = sorted(attributes.keys())

    if len(feature_attr) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    # Count reads
    results = count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout,
        cb_tag,
        ub_tag,
    )
    # Cell barcodes and per-barcode per-feature counts
    cbs = results['cell_barcodes']
    counts = results['counts']

    # Special rows for reads that could not be assigned to a feature
    other_features = [
        '__no_feature',
        '__ambiguous',
        '__too_low_aQual',
        '__not_aligned',
        '__alignment_not_unique',
    ]
    # Empty padding cells under the additional-attribute columns
    pad = ['' for attr in additional_attributes]

    # Assemble the whole table first, then emit it in one pass.
    lines = []
    # Header row: blank corner cell, attribute padding, one cell barcode
    # per column
    lines.append(output_delimiter.join([''] + pad + cbs))
    # One row per feature
    for fn in feature_attr:
        fields = [fn] + attributes[fn] + [str(counts[cb][fn]) for cb in cbs]
        lines.append(output_delimiter.join(fields))
    # Special counters
    for fn in other_features:
        fields = [fn] + pad + [str(counts[cb][fn]) for cb in cbs]
        lines.append(output_delimiter.join(fields))

    # PERF FIX: the previous version reopened the output file in append
    # mode for every single row; open it once and write everything.
    if output_filename == '':
        for line in lines:
            print(line)
    else:
        with open(output_filename, 'w') as f:
            f.write('\n'.join(lines))
            f.write('\n')
Code example #2
0
def count_reads_in_features(args):
    """Count reads in features, parallelizing by file

    Args:
        args: ArgumentParser args, i.e. each argument is a property of this
          instance. Check the CLI parsing function below for a full list
          of properties, i.e. command-line options.

    This function can be conceptually split into the following steps:

    1. Load features from GTF file into memory
    2. Parse the reads from each BAM file in parallel or series
    3. Write output table

    Step 2 can be further split into two main components:

    1. Find what features overlap with each read/read pair
    2. Assign that read/pair to a feature (if unique) or a corner case
       (e.g. multimappers)
    """

    # Load feature GenomicArrayOfSets to mark overlaps
    gff = HTSeq.GFF_Reader(args.featuresfilename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        args.idattr,
        feature_type=args.feature_type,
        feature_query=args.feature_query,
        additional_attributes=args.additional_attributes,
        stranded=args.stranded != "no",
        verbose=not args.quiet,
        add_chromosome_info=args.add_chromosome_info,
    )
    features = feature_scan["features"]
    attributes = feature_scan["attributes"]
    feature_attr = sorted(attributes.keys())
    if len(feature_attr) == 0:
        # BUGFIX: this string was missing the f-prefix, so the literal
        # text "{args.feature_type}" was printed instead of its value.
        sys.stderr.write(
            f"Warning: No features of type '{args.feature_type}' found.\n")

    # Build one argument tuple per input BAM for the counting workers
    count_args, attributes = _prepare_args_for_counting(
        features,
        feature_attr,
        attributes,
        args.add_chromosome_info,
        args.additional_attributes,
        args.feature_query,
        args.feature_type,
        args.featuresfilename,
        args.idattr,
        args.max_buffer_size,
        args.minaqual,
        args.nonunique,
        args.order,
        args.mode,
        args.quiet,
        args.samfilenames,
        args.samout_format,
        args.samouts,
        args.secondary_alignments,
        args.stranded,
        args.supplementary_alignments,
    )
    # Count reads in parallel or in series; with a pool the results come
    # back unordered, so re-sort them by input-file index.
    if args.nprocesses > 1:
        with multiprocessing.Pool(args.nprocesses) as pool:
            results = pool.starmap(count_reads_single_file, count_args)
        results.sort(key=operator.itemgetter("isam"))
    else:
        results = list(itertools.starmap(count_reads_single_file, count_args))

    # Merge and write output
    _write_output(
        results,
        args.samfilenames,
        attributes,
        args.additional_attributes,
        args.output_filename,
        args.output_delimiter,
        args.output_append,
        sparse=args.counts_output_sparse,
        dtype=np.float32,
    )
Code example #3
0
def count_reads_in_features(
    sam_filenames,
    gff_filename,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samouts,
    samout_format,
    output_delimiter,
    output_filename,
    output_append,
    nprocesses,
    feature_query,
):
    '''Count reads in features, parallelizing by file.

    Parameters mirror the htseq-count command-line options; see the CLI
    parsing function for their meaning.  Each input SAM/BAM file becomes
    one column of the output table, with one row per feature plus special
    rows for unassigned reads.  Output goes to stdout when
    ``output_filename`` is the empty string, otherwise to that file.
    '''

    # Never use more CPUs than files
    nprocesses = min(nprocesses, len(sam_filenames))

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of input and output files')
        # Try to open samout files early in case any of them has issues
        if samout_format in ('SAM', 'sam'):
            for samout in samouts:
                with open(samout, 'w'):
                    pass
        else:
            # We don't have a template if the input is stdin
            if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
                for sam_filename, samout in zip(sam_filenames, samouts):
                    with pysam.AlignmentFile(sam_filename, 'r') as sf:
                        with pysam.AlignmentFile(samout, 'w', template=sf):
                            pass
    else:
        samouts = [None for x in sam_filenames]

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with pysam.AlignmentFile(sam_filename, 'r') as sf:
                pass

    # Prepare features: scan the GFF once into a GenomicArrayOfSets
    gff = HTSeq.GFF_Reader(gff_filename)
    feature_scan = HTSeq.make_feature_genomicarrayofsets(
        gff,
        id_attribute,
        feature_type=feature_type,
        feature_query=feature_query,
        additional_attributes=additional_attributes,
        stranded=stranded != 'no',
        verbose=not quiet,
    )
    features = feature_scan['features']
    attributes = feature_scan['attributes']
    feature_attr = sorted(attributes.keys())

    if len(feature_attr) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    # One argument tuple per input file for the counting workers
    args = []
    for isam, (sam_filename,
               samout_filename) in enumerate(zip(sam_filenames, samouts)):
        args.append((
            isam,
            sam_filename,
            features,
            feature_attr,
            order,
            max_buffer_size,
            stranded,
            overlap_mode,
            multimapped_mode,
            secondary_alignment_mode,
            supplementary_alignment_mode,
            feature_type,
            id_attribute,
            additional_attributes,
            quiet,
            minaqual,
            samout_format,
            samout_filename,
        ))

    # Count reads in parallel or in series; with a pool the results come
    # back unordered, so re-sort them by input-file index.
    if nprocesses > 1:
        with multiprocessing.Pool(nprocesses) as pool:
            results = pool.starmap(count_reads_single_file, args)
        results.sort(key=operator.itemgetter('isam'))
    else:
        results = list(itertools.starmap(count_reads_single_file, args))

    # Special rows: (output row title, key in each per-file result dict)
    other_features = [
        ('__no_feature', 'empty'),
        ('__ambiguous', 'ambiguous'),
        ('__too_low_aQual', 'lowqual'),
        ('__not_aligned', 'notaligned'),
        ('__alignment_not_unique', 'nonunique'),
    ]
    # Empty padding cells under the additional-attribute columns
    pad = ['' for attr in additional_attributes]

    # Assemble the whole table first, then emit it in one pass.
    lines = []
    for fn in feature_attr:
        fields = [fn] + attributes[fn] + [str(r['counts'][fn])
                                          for r in results]
        lines.append(output_delimiter.join(fields))
    for title, fn in other_features:
        fields = [title] + pad + [str(r[fn]) for r in results]
        lines.append(output_delimiter.join(fields))

    if output_filename == '':
        for line in lines:
            print(line)
    else:
        # BUGFIX: the file used to be truncated only when the first
        # feature row was written, so with zero features and
        # output_append=False the special counters were appended to stale
        # file content.  Also avoids reopening the file for every row.
        omode = 'a' if output_append else 'w'
        with open(output_filename, omode) as f:
            f.write('\n'.join(lines))
            f.write('\n')