Example #1
def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
        segmentation=None, mapq_th=0, multimax=50, gap_th=4,
        report_progress=False):
    """
    Identify and quantify cross-linked sites.

    Interpret mapped sites and generate BED file with coordinates and
    number of cross-linked events.

    MAPQ is calculated as mapq = int(-10*log10(1 - 1/Nmap)). By default,
    mapq_th is set to 0 to include all reads. The MAPQ score is still very
    useful, because the values coming from STAR are from a very limited set:
    0 (5 or more multiple hits), 1 (3 or 4 multiple hits), 3 (2 multiple
    hits) and 255 (single hit).

    Parameters
    ----------
    bam : str
        Input BAM file with mapped reads.
    sites_single : str
        Output BED6 file to store data from single mapped reads.
    sites_multi : str
        Output BED6 file to store data from single and multi-mapped reads.
    skipped : str
        Output BAM file to store reads that do not map as expected by segmentation and
        the reference genome sequence. If a read's second start does not fall on any of
        the segmentation borders, it is considered problematic. If segmentation is not
        provided, every read in two parts with a gap longer than gap_th is not used
        (skipped). All such reads are reported to the user for further exploration.
    group_by : str
        Assign score of a read to either 'start', 'middle' or 'end' nucleotide.
    quant : str
        Report number of 'cDNA' or number of 'reads'.
    segmentation : str
        File with custom segmentation format (obtained by ``iCount segment``).
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    multimax : int
        Ignore reads mapped to more than ``multimax`` places.
    gap_th : int
        Reads with gaps shorter than gap_th are treated as if they had no gap.
    report_progress : bool
        Switch to report progress.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)  # pylint: disable=protected-access

    assert sites_single.endswith(('.bed', '.bed.gz'))
    assert sites_multi.endswith(('.bed', '.bed.gz'))
    assert skipped.endswith('.bam')
    assert quant in ['cDNA', 'reads']
    assert group_by in ['start', 'middle', 'end']

    metrics = iCount.Metrics()

    single, multi = {}, {}
    progress = 0
    for (chrom, strand), new_progress, by_pos in _processs_bam_file(
            bam, metrics, mapq_th, skipped, segmentation, gap_th):
        if report_progress:
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        single_by_pos = {}
        multi_by_pos = {}
        for xlink_pos, by_bc in by_pos.items():

            # count single mapped reads only
            _update(single_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=1))
            # count all reads mapped less than multimax times
            _update(multi_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=multimax))

        single.setdefault((chrom, strand), {}).update(single_by_pos)
        multi.setdefault((chrom, strand), {}).update(multi_by_pos)

    # Write output
    val_index = ['cDNA', 'reads'].index(quant)
    _save_dict(single, sites_single, val_index=val_index)
    LOGGER.info('Saved to BED file (single mapped reads): %s', sites_single)
    _save_dict(multi, sites_multi, val_index=val_index)
    LOGGER.info('Saved to BED file (multi-mapped reads): %s', sites_multi)

    return metrics
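
The MAPQ formula quoted in the docstring is easy to sanity-check. Below is a minimal sketch (star_mapq is a hypothetical helper, not part of iCount) that reproduces the limited value set reported by STAR:

import math

def star_mapq(nmap):
    """mapq = int(-10*log10(1 - 1/Nmap)); a unique hit is reported as 255."""
    if nmap == 1:
        return 255  # formula is undefined for Nmap=1 (log10(0)); STAR uses 255
    return int(-10 * math.log10(1 - 1 / nmap))

# Expected: 1 hit -> 255, 2 hits -> 3, 3 or 4 hits -> 1, 5+ hits -> 0
for nmap in (1, 2, 3, 4, 5, 10):
    print(nmap, star_mapq(nmap))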
Example #2
def run(annotation,
        sites,
        sigxls,
        scores=None,
        features=None,
        group_by='gene_id',
        merge_features=False,
        half_window=3,
        fdr=0.05,
        perms=100,
        rnd_seed=42,
        report_progress=False):
    """
    Find positions with high density of cross-linked sites.

    When determining feature.name, the value of the first existing attribute
    in the following tuple is taken::

        ("ID", "gene_name", "transcript_id", "gene_id", "Parent")

    Source in pybedtools:
    https://github.com/daler/pybedtools/blob/master/pybedtools/scripts/annotate.py#L34

    Parameters
    ----------
    annotation : str
        Annotation file in GTF format, obtained from "iCount segment" command.
    sites : str
        File with cross-links in BED6 format.
    sigxls : str
        File name for "sigxls" output. The file reports positions with a
        significant number of cross-link events. It should have a .bed or
        .bed.gz extension.
    scores : str
        File name for "scores" output. The file reports all cross-link events,
        independent of their FDR score. It should have a .tsv, .csv, .txt or
        .gz extension.
    features : list_str
        Features from annotation to consider. If None, ['gene'] is used.
        Sometimes, it is advised to use ['gene', 'intergenic'].
    group_by : str
        Attribute by which cross-link positions are grouped.
    merge_features : bool
        Treat all features as one when grouping. Has no effect when only one
        feature is given in features parameter.
    half_window : int
        Half-window size.
    fdr : float
        FDR threshold.
    perms : int
        Number of permutations when calculating random distribution.
    rnd_seed : int
        Seed for random generator.
    report_progress : bool
        Report analysis progress.

    Returns
    -------
    iCount.Metrics
        Analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if features is None:
        features = ['gene']
    assert sigxls.endswith(('.bed', '.bed.gz'))
    if scores:
        assert scores.endswith(
            ('.tsv', '.tsv.gz', '.csv', '.csv.gz', '.txt', '.txt.gz'))
    numpy.random.seed(rnd_seed)  # pylint: disable=no-member

    LOGGER.info('Loading annotation file...')
    annotation2 = iCount.files.decompress_to_tempfile(annotation)
    if annotation2 != annotation:
        to_delete_temp = annotation2
        annotation = annotation2
    else:
        to_delete_temp = None
    annotation = pybedtools.BedTool(annotation).saveas()
    metrics.annotation_all = len(annotation)
    annotation = annotation.filter(lambda x: x[2] in features).sort().saveas()
    metrics.annotation_used = len(annotation)
    metrics.annotation_skipped = metrics.annotation_all - metrics.annotation_used
    LOGGER.info('%d out of %d annotation records will be used (%d skipped).',
                metrics.annotation_used, metrics.annotation_all,
                metrics.annotation_skipped)

    LOGGER.info('Loading cross-links file...')
    sites = pybedtools.BedTool(sites).sort().saveas()

    # intersect cross-linked sites with regions
    LOGGER.info(
        'Calculating intersection between annotation and cross-link file...')
    overlaps = annotation.intersect(sites, sorted=True, s=True,
                                    wo=True).saveas()

    groups = {}
    group_sizes = {}
    multi_mode = len(features) > 1 and not merge_features
    LOGGER.info('Processing intersections...')
    for feature in overlaps:
        chrom = feature.chrom
        start = feature.start
        end = feature.stop
        name = feature.name
        strand = feature.strand
        site_chrom = feature.fields[9]
        site_pos = int(feature.fields[10])
        site_end = int(feature.fields[11])
        site_dot = feature.fields[12]
        site_score = float(feature.fields[13])
        site_strand = feature.fields[14]
        assert site_chrom == chrom
        assert site_strand == strand
        assert site_dot == '.'
        assert site_pos == site_end - 1

        # Determine group_id depending on multi_mode...
        group_id = feature.attrs[group_by]
        if multi_mode:
            group_id = feature[2] + '_' + group_id

        groups.setdefault((chrom, strand, group_id, name), []).append(
            (site_pos, site_score))
        group_sizes.setdefault((chrom, strand, group_id, name), set()).add(
            (start, end))

    # Validate that segments in the same group do not overlap: the start of
    # the next feature must be greater than the stop of the current one:
    for sizes in group_sizes.values():
        sizes = sorted(sizes)
        for first, second in zip(sizes, sizes[1:]):
            assert first[1] < second[0]

    # calculate total length of each group by summing element sizes:
    group_sizes = dict([(name, sum([end - start for start, end in elements]))
                        for name, elements in group_sizes.items()])

    # calculate and assign FDRs to each cross-linked site. FDR values are
    # calculated together for each group.
    results = {}
    metrics.all_groups = len(groups)
    progress, j = 0, 0
    for (chrom, strand, group_id, name), hits in sorted(groups.items()):
        j += 1
        if report_progress:
            new_progress = j / metrics.all_groups
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        group_size = group_sizes[(chrom, strand, group_id, name)]

        # Crucial step: each position in a group is given an fdr_score, based
        # on the hits in the group, group_size, half-window size and number of
        # permutations. Then, FDR scores (plus some other info) are written to
        # the `results` container:
        processed = _process_group(hits, group_size, half_window, perms)
        for (pos, val, val_extended, fdr_score) in processed:
            results.setdefault((chrom, pos, strand), []).\
                append((fdr_score, name, group_id, val, val_extended))
    metrics.positions_annotated = len(results)

    # cross-linked sites outside annotated regions
    LOGGER.info('Determining cross-links not intersecting with annotation...')
    skipped = sites.intersect(annotation, sorted=True, s=True, v=True).saveas()
    for feature in skipped:
        site_chrom = feature.chrom
        site_start = feature.start
        site_end = feature.stop
        # site_name = feature.name
        site_score = feature.score
        site_strand = feature.strand
        assert site_start == site_end - 1
        k = (site_chrom, site_start, site_strand)
        assert k not in results
        results.setdefault(k, []).\
            append((1.0, 'not_annotated', 'not_annotated', site_score, 'not_calculated'))

    metrics.positions_all = len(results)
    metrics.positions_not_annotated = metrics.positions_all - metrics.positions_annotated
    LOGGER.info(
        'Significant crosslinks calculation finished. Writing results to files...'
    )

    # Make sigxls: a BED6 file, with only the most significant cross-links:
    metrics.significant_positions = 0
    with iCount.files.gz_open(sigxls, 'wt') as sigxls:
        for (chrom, pos, strand), annot_list in sorted(results.items()):
            annot_list = sorted(annot_list)

            # report minimum fdr_score for each position in BED6
            min_fdr_score = annot_list[0][0]
            if min_fdr_score < fdr:
                metrics.significant_positions += 1
                # position has significant records - report the most significant ones:
                min_fdr_records = [
                    rec for rec in annot_list if rec[0] == min_fdr_score
                ]

                _, names, group_ids, group_scores, _ = zip(*min_fdr_records)
                if names == group_ids:
                    name = ','.join(names)
                else:
                    name = ','.join(names) + '-' + ','.join(group_ids)
                line = [chrom, pos, pos + 1, name, group_scores[0], strand]
                sigxls.write('\t'.join([_f2s(i, dec=4) for i in line]) + '\n')
    LOGGER.info('BED6 file with significant crosslinks saved to: %s',
                sigxls.name)

    # Make scores: a tab-separated file with ALL cross-links (no significance threshold)
    header = [
        'chrom', 'position', 'strand', 'name', 'group_id', 'score',
        'score_extended', 'FDR'
    ]
    if scores:
        with iCount.files.gz_open(scores, 'wt') as scores:
            scores.write('\t'.join(header) + '\n')
            for (chrom, pos, strand), annot_list in sorted(results.items()):
                for (fdr_score, name, group_id, score,
                     val_extended) in sorted(annot_list):
                    line = [
                        chrom, pos, strand, name, group_id, score,
                        val_extended, fdr_score
                    ]
                    scores.write('\t'.join([_f2s(i, dec=6)
                                            for i in line]) + '\n')
        LOGGER.info('Scores for each cross-linked position saved to: %s',
                    scores.name)

    if to_delete_temp:
        os.remove(to_delete_temp)

    LOGGER.info('Done.')
    return metrics
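
The statistics live in _process_group, whose body is not shown in this example. The sketch below illustrates the permutation idea only — permutation_fdr_sketch is a hypothetical stand-in, not the actual iCount implementation; hits, group_size, half_window and perms carry the same meaning as in run:

import numpy

def permutation_fdr_sketch(hits, group_size, half_window, perms):
    """Empirical FDR per position: the fraction of uniform permutations whose
    best window scores at least as high as the observed window at that site."""
    offset = min(pos for pos, _ in hits)
    counts = numpy.zeros(group_size)
    for pos, score in hits:
        counts[min(pos - offset, group_size - 1)] += score
    kernel = numpy.ones(2 * half_window + 1)
    observed = numpy.convolve(counts, kernel, mode='same')
    # Null model: scatter the same number of events uniformly over the group.
    total = int(round(counts.sum()))
    null_peaks = numpy.empty(perms)
    for i in range(perms):
        rnd = numpy.bincount(numpy.random.randint(0, group_size, total),
                             minlength=group_size).astype(float)
        null_peaks[i] = numpy.convolve(rnd, kernel, mode='same').max()
    return {pos: float((null_peaks >= observed[min(pos - offset, group_size - 1)]).mean())
            for pos, _ in hits}

# e.g. permutation_fdr_sketch([(100, 5.0), (103, 2.0), (250, 1.0)], 300, 3, 100)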
Example #3
def run(bam,
        segmentation,
        out_file,
        strange,
        cross_transcript,
        implicit_handling='closest',
        mismatches=2,
        mapq_th=0,
        holesize_th=4,
        max_barcodes=10000):
    """
    Compute distribution of cross-links relative to genomic landmarks.

    Parameters
    ----------
    bam : str
        BAM file with aligned reads.
    segmentation : str
        GTF file with segmentation. Should be a file produced by function
        `get_segments`.
    out_file : str
        Output file with analysis results.
    strange : str
        File with strange properties obtained when processing the BAM file.
    cross_transcript : str
        File with reads spanning over multiple transcripts or multiple genes.
    implicit_handling : str
        Can be 'closest' or 'split'. For an implicit read, either split the
        score between both neighbours or give it just to the closest neighbour.
    mismatches : int
        Reads on the same position with random barcodes differing by less than
        ``mismatches`` are grouped together.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    holesize_th : int
        Reads with holes smaller than holesize_th are treated as if they had
        no holes.
    max_barcodes : int
        Skip merging similar barcodes if the number of distinct barcodes at a
        position is higher than this.


    Returns
    -------
    iCount.Metrics
        Analysis metadata. Per-position (all, explicit) scores for each
        RNA-map type are written to ``out_file``.

    """
    iCount.logger.log_inputs(LOGGER)

    if implicit_handling not in ('closest', 'split'):
        raise ValueError(
            'Parameter implicit_handling should be one of "closest" or "split"'
        )

    metrics = iCount.Metrics()
    metrics.cross_transcript = 0
    metrics.origin_premrna = 0
    metrics.origin_mrna = 0
    metrics.origin_ambiguous = 0

    # The root container:
    data = {}

    progress = 0
    LOGGER.info('Processing data...')
    # pylint: disable=protected-access
    for (chrom, strand), new_progress, by_pos in \
            iCount.mapping.xlsites._processs_bam_file(
                bam,
                metrics,
                mapq_th,
                strange,
                segmentation=segmentation,
                gap_th=holesize_th):

        # pylint: disable=protected-access
        progress = iCount._log_progress(new_progress, progress, LOGGER)

        # Sort all genes (and intergenic) by start coordinate.
        segmentation_sorted = sorted(
            iCount.genomes.segment._prepare_segmentation(
                segmentation, chrom, strand).items(),
            key=lambda x: x[1]['gene_segment'].start)
        seg_max_index = len(segmentation_sorted) - 1
        start_gene_index, stop_gene_index = 0, seg_max_index

        for xlink_pos, by_bc in sorted(by_pos.items()):
            # pylint: disable=protected-access
            iCount.mapping.xlsites._merge_similar_randomers(
                by_bc, mismatches, max_barcodes)
            # by_bc is modified in place in _merge_similar_randomers

            # reads is a list of reads belonging to given barcode in by_bc
            for reads in by_bc.values():
                ss_groups = {}
                for read in reads:
                    # Define second start groups:
                    ss_groups.setdefault(read[4], []).append(read)

                # Process each second start group:
                for ss_group in ss_groups.values():

                    # The following block extracts just the required genes (& gene_content)
                    # without iterating through all genes/content in chromosome.

                    # Sort reads by length and take the longest one (read_len is 3rd column)!
                    ss_group = sorted(ss_group, key=lambda x: (-x[2]))
                    stop = ss_group[0][1]
                    start = xlink_pos
                    segmentation_subset = []
                    passed_start = False  # Whether the start_gene_index was already found.
                    for gene_item in segmentation_sorted[start_gene_index:]:
                        gene_segment = gene_item[1]['gene_segment']

                        if gene_segment.start <= start <= gene_segment.stop:
                            start_gene_index = segmentation_sorted.index(
                                gene_item)
                            passed_start = True

                        if passed_start:
                            segmentation_subset.append(gene_item)

                        if gene_segment.start <= stop <= gene_segment.stop:
                            stop_gene_index = segmentation_sorted.index(
                                gene_item)
                            # Append also one gene before (insert on first position to keep sorted)
                            segmentation_subset.insert(
                                0, segmentation_sorted[max(
                                    start_gene_index - 1, 0)])
                            # Append also one gene after:
                            segmentation_subset.append(segmentation_sorted[min(
                                stop_gene_index + 1, seg_max_index)])
                            break
                        # Even if entries repeat, this is still OK, since the
                        # first and last gene need to be the ones not including
                        # start/stop!

                    # segmentation_subset is defined. Now process this group:
                    _process_read_group(xlink_pos,
                                        chrom,
                                        strand,
                                        ss_group[0],
                                        data,
                                        segmentation_subset,
                                        metrics,
                                        implicit_handling=implicit_handling)

    LOGGER.info('Writing output files...')

    header = ['RNAmap type', 'position', 'all', 'explicit']
    cross_tr_header = [
        'chrom', 'strand', 'xlink', 'second-start', 'end-position', 'read_len'
    ]
    with open(out_file, 'wt') as ofile, open(cross_transcript, 'wt') as ctfile:
        ofile.write('\t'.join(header) + '\n')
        ctfile.write('\t'.join(cross_tr_header) + '\n')
        for rna_map_type, positions in sorted(data.items()):
            if rna_map_type == 'cross_transcript':
                for (chrom, strand, xlink), read_list in positions.items():
                    for (_, end, read_len, _, second_start) in read_list:
                        ctfile.write('\t'.join(
                            map(str, [
                                chrom, strand, xlink, second_start, end,
                                read_len
                            ])) + '\n')
            else:
                for position, [all_, explic] in sorted(positions.items()):
                    # Round to 4 decimal places with _f2s function:
                    all_, explic = _f2s(all_, dec=4), _f2s(explic, dec=4)
                    ofile.write(
                        '\t'.join([rna_map_type,
                                   str(position), all_, explic]) + '\n')

    LOGGER.info('RNA-maps output written to: %s', out_file)
    LOGGER.info('Reads spanning multiple transcripts written to: %s',
                cross_transcript)
    LOGGER.info('Done.')
    return metrics
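
The second-start bookkeeping above is compact but dense; here is a standalone toy illustration. The read tuples are made up, with the field layout inferred from how the code indexes them (index 1 is the end position, index 2 the read length, index 4 the second start; the remaining fields are placeholders):

reads = [
    (0, 150, 40, 0, 120),  # second start 120
    (0, 160, 55, 0, 120),  # second start 120, longest read in its group
    (0, 200, 35, 0, 180),  # second start 180
]

ss_groups = {}
for read in reads:
    ss_groups.setdefault(read[4], []).append(read)  # group by second start

for second_start, ss_group in sorted(ss_groups.items()):
    longest = max(ss_group, key=lambda x: x[2])  # same read the sort above picks
    print(second_start, 'stop =', longest[1])    # -> 120 stop = 160, 180 stop = 200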
Example #4
def _get_gene_content(gtf, chromosomes, report_progress=False):
    """
    Generator giving groups of intervals belonging to one gene.

    The yielded structure in each iteration is a dictionary that has
    key-value pairs:

        * 'gene': interval of type gene
        * 'transcript_id#1': intervals corresponding to transcript_id#1
        * 'transcript_id#2': intervals corresponding to transcript_id#2
        ...


    Parameters
    ----------
    gtf : str
        Path to gtf input file.
    chromosomes : list
        List of chromosomes to consider.
    report_progress : bool
        Switch to show progress.

    Yields
    ------
    dict
        All intervals in one gene, separated by transcript_id.

    """
    # Lists to keep track of all already processed genes/transcripts:
    gene_ids = []
    transcript_ids = []

    current_transcript = None
    current_gene = None
    gene_content = {}

    def finalize(gene_content):
        """Procedure before returning group of intervals belonging to one gene."""
        if 'gene' not in gene_content:
            # Manually create "gene interval":
            int1 = next(iter(gene_content.values()))[0]
            col8 = _filter_col8(int1)
            start = min([i.start for j in gene_content.values() for i in j])
            stop = max([i.stop for j in gene_content.values() for i in j])
            gene_content['gene'] = create_interval_from_list(
                int1[:2] + ['gene', start + 1, stop] + int1[5:8] + [col8])
        return gene_content

    length = pybedtools.BedTool(gtf).count()
    progress, j = 0, 0
    for interval in pybedtools.BedTool(gtf):
        j += 1
        if report_progress:
            new_progress = j / length
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        if interval.chrom in chromosomes:
            # Segments without a 'transcript_id' attribute are the ones that
            # define genes. Such intervals are not present in all releases.
            if interval.attrs['gene_id'] == current_gene:
                if interval.attrs['transcript_id'] == current_transcript:
                    # Same gene, same transcript: just add to container:
                    gene_content[current_transcript].append(interval)
                else:
                    # New transcript - confirm that it is really a new one:
                    current_transcript = interval.attrs['transcript_id']
                    assert current_transcript not in transcript_ids
                    transcript_ids.append(current_transcript)
                    gene_content[current_transcript] = [interval]

            else:  # New gene!
                # First process old content:
                if gene_content:  # To survive the first iteration
                    yield finalize(gene_content)

                # Confirm that it is really a new gene!
                assert interval.attrs['gene_id'] not in gene_ids
                # Then add it to already processed genes:
                current_gene = interval.attrs['gene_id']
                gene_ids.append(current_gene)

                # Make empty container and classify interval
                gene_content = {}
                if interval[2] == 'gene':
                    gene_content['gene'] = interval
                elif 'transcript_id' in interval.attrs:
                    current_transcript = interval.attrs['transcript_id']
                    assert current_transcript not in transcript_ids
                    transcript_ids.append(current_transcript)
                    gene_content[current_transcript] = [interval]
                else:
                    raise ValueError(
                        'Interval is neither a gene nor has a transcript_id attribute.')

    # Yield the group for the last gene (guard against empty input):
    if gene_content:
        yield finalize(gene_content)
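
A minimal usage sketch of the generator — 'annotation.gtf' and the chromosome names are placeholders, and the GTF is assumed to list each gene's records contiguously (the assertions above rely on that ordering):

for gene_content in _get_gene_content('annotation.gtf', ['chr1', 'chr2']):
    gene = gene_content['gene']  # the gene interval itself
    transcripts = [key for key in gene_content if key != 'gene']
    print(gene.attrs['gene_id'], gene.start, gene.stop, len(transcripts))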