Example 1
import pybedtools


def _iter_bed_dict(bed, val_index=None):
    """Yield BED6 intervals from a {(chrom, strand): {pos: value}} dict.

    If ``val_index`` is given, each stored value is indexable and only the
    element at ``val_index`` is used as the score.
    """
    for (chrom, strand), by_pos in bed.items():
        for pos, val in by_pos.items():
            if val_index is not None:
                val = val[val_index]
            # Interval fields must be strings; _f2s is a module-private
            # float-to-string helper, not shown in this example.
            yield pybedtools.create_interval_from_list(
                [chrom, str(pos), str(pos + 1), '.', _f2s(val), strand]
            )
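The function assumes a nested mapping keyed by (chrom, strand) and then by
position. A minimal usage sketch, with a hypothetical stand-in for the
module-private _f2s helper (its exact formatting in iCount may differ):

def _f2s(value, dec=4):
    """Hypothetical float-to-string helper mirroring the usage above."""
    return '{:.{}f}'.format(value, dec).rstrip('0').rstrip('.')

bed = {
    ('chr1', '+'): {100: 2.0, 250: 5.5},
    ('chr1', '-'): {300: 1.0},
}
for interval in _iter_bed_dict(bed):
    print(str(interval).strip())
# Expected BED6-style lines, e.g.:
# chr1    100    101    .    2    +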
Example 2
import logging

import iCount

LOGGER = logging.getLogger(__name__)

# _f2s and _process_read_group are module-private helpers from the same
# iCount module; they are not shown in this example.


def run(bam,
        segmentation,
        out_file,
        strange,
        cross_transcript,
        implicit_handling='closest',
        mismatches=2,
        mapq_th=0,
        holesize_th=4,
        max_barcodes=10000):
    """
    Compute distribution of cross-links relative to genomic landmarks.

    Parameters
    ----------
    bam : str
        BAM file with aligned reads.
    segmentation : str
        GTF file with segmentation. Should be a file produced by function
        `get_segments`.
    out_file : str
        Output file with analysis results.
    strange : str
        File with reads having strange properties, obtained when processing
        the BAM file.
    cross_transcript : str
        File with reads spanning over multiple transcripts or multiple genes.
    implicit_handling : str
        Can be 'closest' or 'split'. For an implicit read, either split the
        score between both neighbours or give it all to the closest
        neighbour.
    mismatches : int
        Reads at the same position with random barcodes differing by fewer
        than ``mismatches`` are grouped together.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    holesize_th : int
        Reads with holes smaller than ``holesize_th`` are treated as if they
        had no holes.
    max_barcodes : int
        Skip merging of similar barcodes if the number of distinct barcodes
        at a position is higher than this.

    Returns
    -------
    str
        File with the (all, explicit) scores for each position in each
        RNA-map type.

    """
    iCount.logger.log_inputs(LOGGER)

    if implicit_handling not in ('closest', 'split'):
        raise ValueError(
            'Parameter implicit_handling should be one of "closest" or "split"'
        )

    metrics = iCount.Metrics()
    metrics.cross_transcript = 0
    metrics.origin_premrna = 0
    metrics.origin_mrna = 0
    metrics.origin_ambiguous = 0

    # The root container:
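    # Maps RNA-map type -> {position: [all, explicit]} scores; a special
    # 'cross_transcript' key instead maps (chrom, strand, xlink) -> list of
    # reads (this layout is inferred from the writer loop below).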
    data = {}

    progress = 0
    LOGGER.info('Processing data...')
    # pylint: disable=protected-access
    for (chrom, strand), new_progress, by_pos in \
            iCount.mapping.xlsites._processs_bam_file(
                bam,
                metrics,
                mapq_th,
                strange,
                segmentation=segmentation,
                gap_th=holesize_th):

        # pylint: disable=protected-access
        progress = iCount._log_progress(new_progress, progress, LOGGER)

        # Sort all genes (and intergenic) by start coordinate.
        segmentation_sorted = sorted(
            iCount.genomes.segment._prepare_segmentation(
                segmentation, chrom, strand).items(),
            key=lambda x: x[1]['gene_segment'].start)
        seg_max_index = len(segmentation_sorted) - 1
        start_gene_index, stop_gene_index = 0, seg_max_index

        for xlink_pos, by_bc in sorted(by_pos.items()):
            # pylint: disable=protected-access
            iCount.mapping.xlsites._merge_similar_randomers(
                by_bc, mismatches, max_barcodes)
            # by_bc is modified in place in _merge_similar_randomers

            # reads is a list of reads belonging to given barcode in by_bc
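            # Each read tuple carries, among other fields, the end position
            # (index 1), read length (index 2) and second start (index 4), as
            # used below and when writing the cross_transcript output.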
            for reads in by_bc.values():
                ss_groups = {}
                for read in reads:
                    # Define second start groups:
                    ss_groups.setdefault(read[4], []).append(read)

                # Process each second start group:
                for ss_group in ss_groups.values():

                    # The following block extracts just the required genes (&
                    # gene_content) without iterating through all genes/content
                    # in the chromosome.

                    # Sort reads by length and take the longest one (read_len is 3rd column)!
                    ss_group = sorted(ss_group, key=lambda x: (-x[2]))
                    stop = ss_group[0][1]
                    start = xlink_pos
                    segmentation_subset = []
                    passed_start = False  # Whether the start_gene_index was already found.
                    for gene_item in segmentation_sorted[start_gene_index:]:
                        gene_segment = gene_item[1]['gene_segment']

                        if gene_segment.start <= start <= gene_segment.stop:
                            start_gene_index = segmentation_sorted.index(
                                gene_item)
                            passed_start = True

                        if passed_start:
                            segmentation_subset.append(gene_item)

                        if gene_segment.start <= stop <= gene_segment.stop:
                            stop_gene_index = segmentation_sorted.index(
                                gene_item)
                            # Append also one gene before (insert on first position to keep sorted)
                            segmentation_subset.insert(
                                0, segmentation_sorted[max(
                                    start_gene_index - 1, 0)])
                            # Append also one gene after:
                            segmentation_subset.append(segmentation_sorted[min(
                                stop_gene_index + 1, seg_max_index)])
                            break
                        # Even if entries repeat, this is still OK, since the
                        # first and last gene need to be the ones not including
                        # start/stop!

                    # segmentation_subset is defined. Now process this group:
                    _process_read_group(xlink_pos,
                                        chrom,
                                        strand,
                                        ss_group[0],
                                        data,
                                        segmentation_subset,
                                        metrics,
                                        implicit_handling=implicit_handling)

    LOGGER.info('Writing output files...')

    header = ['RNAmap type', 'position', 'all', 'explicit']
    cross_tr_header = [
        'chrom', 'strand', 'xlink', 'second-start', 'end-position', 'read_len'
    ]
    with open(out_file, 'wt') as ofile, open(cross_transcript, 'wt') as ctfile:
        ofile.write('\t'.join(header) + '\n')
        ctfile.write('\t'.join(cross_tr_header) + '\n')
        for rna_map_type, positions in sorted(data.items()):
            if rna_map_type == 'cross_transcript':
                for (chrom, strand, xlink), read_list in positions.items():
                    for (_, end, read_len, _, second_start) in read_list:
                        ctfile.write('\t'.join(
                            map(str, [
                                chrom, strand, xlink, second_start, end,
                                read_len
                            ])) + '\n')
            else:
                for position, [all_, explic] in sorted(positions.items()):
                    # Round to 4 decimal places with _f2s function:
                    all_, explic = _f2s(all_, dec=4), _f2s(explic, dec=4)
                    ofile.write(
                        '\t'.join([rna_map_type,
                                   str(position), all_, explic]) + '\n')

    LOGGER.info('RNA-maps output written to: %s', out_file)
    LOGGER.info('Reads spanning multiple transcripts written to: %s',
                cross_transcript)
    LOGGER.info('Done.')
    return metrics
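A hedged invocation sketch (file names are placeholders; only the signature
and defaults defined above are assumed):

metrics = run(
    'sample.bam',            # aligned reads
    'segmentation.gtf',      # produced by get_segments
    'rnamaps.tsv',           # per-position (all, explicit) scores
    'strange.txt',           # reads with strange properties
    'cross_transcript.tsv',  # reads spanning multiple transcripts/genes
    implicit_handling='split',
)
print(metrics.cross_transcript)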
Example 3
import logging
import os

import numpy
import pybedtools

import iCount

LOGGER = logging.getLogger(__name__)

# _f2s and _process_group are module-private helpers from the same iCount
# module; they are not shown in this example.


def run(annotation,
        sites,
        sigxls,
        scores=None,
        features=None,
        group_by='gene_id',
        merge_features=False,
        half_window=3,
        fdr=0.05,
        perms=100,
        rnd_seed=42,
        report_progress=False):
    """
    Find positions with high density of cross-linked sites.

    When determining ``feature.name``, the value of the first existing
    attribute in the following tuple is taken::

        ("ID", "gene_name", "transcript_id", "gene_id", "Parent")

    Source in pybedtools:
    https://github.com/daler/pybedtools/blob/master/pybedtools/scripts/annotate.py#L34

    Parameters
    ----------
    annotation : str
        Annotation file in GTF format, obtained from "iCount segment" command.
    sites : str
        File with cross-links in BED6 format.
    sigxls : str
        File name for "sigxls" output. File reports positions with significant
        number of cross-link events. It should have .bed or .bed.gz extension.
    scores : str
        File name for "scores" output. File reports all cross-link events,
        independent from their FDR score It should have .tsv, .csv, .txt or .gz
        extension.
    features : list_str
        Features from annotation to consider. If None, ['gene'] is used.
        Sometimes, it is advised to use ['gene', 'intergenic'].
    group_by : str
        Attribute by which cross-link positions are grouped.
    merge_features : bool
        Treat all features as one when grouping. Has no effect when only one
        feature is given in the features parameter.
    half_window : int
        Half-window size.
    fdr : float
        FDR threshold.
    perms : int
        Number of permutations when calculating random distribution.
    rnd_seed : int
        Seed for random generator.
    report_progress : bool
        Report analysis progress.

    Returns
    -------
    iCount.metrics
        Analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if features is None:
        features = ['gene']
    assert sigxls.endswith(('.bed', '.bed.gz'))
    if scores:
        assert scores.endswith(
            ('.tsv', '.tsv.gz', '.csv', '.csv.gz', '.txt', '.txt.gz'))
    numpy.random.seed(rnd_seed)  # pylint: disable=no-member

    LOGGER.info('Loading annotation file...')
    annotation2 = iCount.files.decompress_to_tempfile(annotation)
    if annotation2 != annotation:
        to_delete_temp = annotation2
        annotation = annotation2
    else:
        to_delete_temp = None
    annotation = pybedtools.BedTool(annotation).saveas()
    metrics.annotation_all = len(annotation)
    annotation = annotation.filter(lambda x: x[2] in features).sort().saveas()
    metrics.annotation_used = len(annotation)
    metrics.annotation_skipped = metrics.annotation_all - metrics.annotation_used
    LOGGER.info('%d out of %d annotation records will be used (%d skipped).',
                metrics.annotation_used, metrics.annotation_all,
                metrics.annotation_skipped)

    LOGGER.info('Loading cross-links file...')
    sites = pybedtools.BedTool(sites).sort().saveas()

    # intersect cross-linked sites with regions
    LOGGER.info(
        'Calculating intersection between annotation and cross-link file...')
    overlaps = annotation.intersect(sites, sorted=True, s=True,
                                    wo=True).saveas()

    groups = {}
    group_sizes = {}
    multi_mode = len(features) > 1 and not merge_features
    LOGGER.info('Processing intersections...')
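    # With wo=True, each overlap record is the 9 GTF fields of the annotation
    # feature, followed by the 6 BED fields of the cross-link site (indices
    # 9-14) and the length of the overlap.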
    for feature in overlaps:
        chrom = feature.chrom
        start = feature.start
        end = feature.stop
        name = feature.name
        strand = feature.strand
        site_chrom = feature.fields[9]
        site_pos = int(feature.fields[10])
        site_end = int(feature.fields[11])
        site_dot = feature.fields[12]
        site_score = float(feature.fields[13])
        site_strand = feature.fields[14]
        assert site_chrom == chrom
        assert site_strand == strand
        assert site_dot == '.'
        assert site_pos == site_end - 1

        # Determine group_id depending on multi_mode...
        group_id = feature.attrs[group_by]
        if multi_mode:
            group_id = feature[2] + '_' + group_id

        groups.setdefault((chrom, strand, group_id, name), []).append(
            (site_pos, site_score))
        group_sizes.setdefault((chrom, strand, group_id, name), set()).add(
            (start, end))

    # Validate that segments in the same group do not overlap: the start of
    # the next feature must be greater than the stop of the current one:
    for sizes in group_sizes.values():
        sizes = sorted(sizes)
        for first, second in zip(sizes, sizes[1:]):
            assert first[1] < second[0]

    # calculate total length of each group by summing element sizes:
    group_sizes = dict([(name, sum([end - start for start, end in elements]))
                        for name, elements in group_sizes.items()])

    # calculate and assign FDRs to each cross-linked site. FDR values are
    # calculated together for each group.
    results = {}
    metrics.all_groups = len(groups)
    progress, j = 0, 0
    for (chrom, strand, group_id, name), hits in sorted(groups.items()):
        j += 1
        if report_progress:
            new_progress = j / metrics.all_groups
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        group_size = group_sizes[(chrom, strand, group_id, name)]

        # Crucial step: each position in a group is given an fdr_score, based
        # on the hits in the group, group_size, half-window size and number of
        # permutations. Then, FDR scores (+ some other info) are written to
        # the `results` container:
        processed = _process_group(hits, group_size, half_window, perms)
        for (pos, val, val_extended, fdr_score) in processed:
            results.setdefault((chrom, pos, strand), []).\
                append((fdr_score, name, group_id, val, val_extended))
    metrics.positions_annotated = len(results)

    # cross-linked sites outside annotated regions
    LOGGER.info('Determining cross-links not intersecting with annotation...')
    skipped = sites.intersect(annotation, sorted=True, s=True, v=True).saveas()
    for feature in skipped:
        site_chrom = feature.chrom
        site_start = feature.start
        site_end = feature.stop
        # site_name = feature.name
        site_score = feature.score
        site_strand = feature.strand
        assert site_start == site_end - 1
        k = (site_chrom, site_start, site_strand)
        assert k not in results
        results.setdefault(k, []).\
            append((1.0, 'not_annotated', 'not_annotated', site_score, 'not_calculated'))

    metrics.positions_all = len(results)
    metrics.positions_not_annotated = metrics.positions_all - metrics.positions_annotated
    LOGGER.info(
        'Significant crosslinks calculation finished. Writing results to files...'
    )

    # Make sigxls: a BED6 file, with only the most significant cross-links:
    metrics.significant_positions = 0
    with iCount.files.gz_open(sigxls, 'wt') as sigxls:
        for (chrom, pos, strand), annot_list in sorted(results.items()):
            annot_list = sorted(annot_list)

            # report minimum fdr_score for each position in BED6
            min_fdr_score = annot_list[0][0]
            if min_fdr_score < fdr:
                metrics.significant_positions += 1
                # position has significant records - report the most significant ones:
                min_fdr_records = [
                    rec for rec in annot_list if rec[0] == min_fdr_score
                ]

                _, names, group_ids, group_scores, _ = zip(*min_fdr_records)
                if names == group_ids:
                    name = ','.join(names)
                else:
                    name = ','.join(names) + '-' + ','.join(group_ids)
                line = [chrom, pos, pos + 1, name, group_scores[0], strand]
                sigxls.write('\t'.join([_f2s(i, dec=4) for i in line]) + '\n')
    LOGGER.info('BED6 file with significant crosslinks saved to: %s',
                sigxls.name)

    # Make scores: a tab-separated file with ALL cross-links (no significance threshold)
    header = [
        'chrom', 'position', 'strand', 'name', 'group_id', 'score',
        'score_extended', 'FDR'
    ]
    if scores:
        with iCount.files.gz_open(scores, 'wt') as scores:
            scores.write('\t'.join(header) + '\n')
            for (chrom, pos, strand), annot_list in sorted(results.items()):
                for (fdr_score, name, group_id, score,
                     val_extended) in sorted(annot_list):
                    line = [
                        chrom, pos, strand, name, group_id, score,
                        val_extended, fdr_score
                    ]
                    scores.write('\t'.join([_f2s(i, dec=6)
                                            for i in line]) + '\n')
        LOGGER.info('Scores for each cross-linked position saved to: %s',
                    scores.name)

    if to_delete_temp:
        os.remove(to_delete_temp)

    LOGGER.info('Done.')
    return metrics
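A hedged invocation sketch (paths are placeholders; the extensions follow the
assertions at the top of the function):

metrics = run(
    'segmentation.gtf',      # from the "iCount segment" command
    'cross_links.bed',       # cross-link events in BED6 format
    'peaks.bed.gz',          # sigxls output (.bed or .bed.gz)
    scores='scores.tsv.gz',  # optional per-position report
    features=['gene', 'intergenic'],
    fdr=0.05,
)
print(metrics.significant_positions)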