def _iter_bed_dict(bed, val_index=None):
    """
    Yield one interval for every position stored in a nested dict.

    Parameters
    ----------
    bed : dict
        Mapping of ``(chrom, strand)`` tuples to dicts that map a position
        to a value.
    val_index : int or None
        When given, the stored value is indexed with ``val_index`` and only
        that element is reported. When None, the stored value is reported
        as is.

    Yields
    ------
    object
        Result of ``pybedtools.create_interval_from_list`` called with the
        six columns ``[chrom, pos, pos + 1, '.', _f2s(value), strand]``.
    """
    pick_element = val_index is not None
    for (chrome, strand), by_pos in bed.items():
        for pos, stored in by_pos.items():
            # Either a single element of the stored value or the value itself.
            score = stored[val_index] if pick_element else stored
            columns = [chrome, pos, pos + 1, '.', _f2s(score), strand]
            yield pybedtools.create_interval_from_list(columns)
def run(bam,
        segmentation,
        out_file,
        strange,
        cross_transcript,
        implicit_handling='closest',
        mismatches=2,
        mapq_th=0,
        holesize_th=4,
        max_barcodes=10000):
    """
    Compute distribution of cross-links relative to genomic landmarks.

    Parameters
    ----------
    bam : str
        BAM file with aligned reads.
    segmentation : str
        GTF file with segmentation. Should be a file produced by function
        `get_segments`.
    out_file : str
        Output file with analysis results.
    strange : str
        File with strange properties obtained when processing bam file.
    cross_transcript : str
        File with reads spanning over multiple transcripts or multiple genes.
    implicit_handling : str
        Can be 'closest' or 'split'. In case of implicit read - split score to
        both neighbours or give it just to the closest neighbour.
    mismatches : int
        Reads on same position with random barcode differing less than
        ``mismatches`` are grouped together.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    holesize_th : int
        Reads with size of holes less than holesize_th are treated as if they
        would have no holes.
    max_barcodes : int
        Skip merging similar barcodes if number of distinct barcodes at
        position is higher than this.

    Returns
    -------
    iCount.Metrics
        Metrics object created at the start of the run. The (all, explicit)
        scores per position of each RNA-map type are written to ``out_file``
        and the entries stored under the 'cross_transcript' key are written
        to ``cross_transcript``.

    Raises
    ------
    ValueError
        If ``implicit_handling`` is neither 'closest' nor 'split'.
    """
    iCount.logger.log_inputs(LOGGER)

    if implicit_handling not in ('closest', 'split'):
        raise ValueError(
            'Parameter implicit_handling should be one of "closest" or "split"'
        )

    metrics = iCount.Metrics()
    # Counters start at zero; the same object is handed to the BAM reader and
    # to _process_read_group below (presumably they update these counters).
    metrics.cross_transcript = 0
    metrics.origin_premrna = 0
    metrics.origin_mrna = 0
    metrics.origin_ambiguous = 0

    # The root container:
    data = {}

    progress = 0
    LOGGER.info('Processing data...')
    # pylint: disable=protected-access
    for (chrom, strand), new_progress, by_pos in iCount.mapping.xlsites._processs_bam_file(
            bam, metrics, mapq_th, strange, segmentation=segmentation,
            gap_th=holesize_th):
        # pylint: disable=protected-access
        progress = iCount._log_progress(new_progress, progress, LOGGER)

        # Sort all genes (and intergenic) by start coordinate.
        segmentation_sorted = sorted(
            iCount.genomes.segment._prepare_segmentation(
                segmentation, chrom, strand).items(),
            key=lambda x: x[1]['gene_segment'].start)
        seg_max_index = len(segmentation_sorted) - 1
        # start_gene_index is kept between cross-links: positions are visited
        # in sorted order, so the scan below resumes from the last match.
        start_gene_index, stop_gene_index = 0, seg_max_index

        for xlink_pos, by_bc in sorted(by_pos.items()):
            # pylint: disable=protected-access
            iCount.mapping.xlsites._merge_similar_randomers(
                by_bc, mismatches, max_barcodes)
            # by_bc is modified in place in _merge_similar_randomers

            # reads is a list of reads belonging to given barcode in by_bc
            for reads in by_bc.values():
                # Define second start groups (second start is the 5th column):
                ss_groups = {}
                for read in reads:
                    ss_groups.setdefault(read[4], []).append(read)

                # Process each second start group:
                for ss_group in ss_groups.values():
                    # The following block extracts just the required genes
                    # (& gene_content) without iterating through all
                    # genes/content in chromosome.

                    # Take the longest read (read_len is 3rd column). On ties
                    # the first read in the group wins, exactly as a stable
                    # descending sort would pick it.
                    longest_read = max(ss_group, key=lambda item: item[2])
                    stop = longest_read[1]
                    start = xlink_pos
                    segmentation_subset = []
                    # Whether the start_gene_index was already found.
                    passed_start = False
                    # enumerate() receives the offset once, before the loop
                    # body rebinds start_gene_index, so gene_index is always
                    # the position of gene_item in segmentation_sorted.
                    for gene_index, gene_item in enumerate(
                            segmentation_sorted[start_gene_index:],
                            start_gene_index):
                        gene_segment = gene_item[1]['gene_segment']
                        if gene_segment.start <= start <= gene_segment.stop:
                            start_gene_index = gene_index
                            passed_start = True
                        if passed_start:
                            segmentation_subset.append(gene_item)
                        if gene_segment.start <= stop <= gene_segment.stop:
                            stop_gene_index = gene_index
                            # Even if entries repeat, this is still OK, since
                            # first and last gene need to be the ones not
                            # including start/stop!
                            # Append also one gene before (insert on first
                            # position to keep sorted):
                            segmentation_subset.insert(
                                0,
                                segmentation_sorted[max(start_gene_index - 1, 0)])
                            # Append also one gene after:
                            segmentation_subset.append(
                                segmentation_sorted[min(stop_gene_index + 1,
                                                        seg_max_index)])
                            break

                    # segmentation_subset is defined. Now process this group:
                    _process_read_group(xlink_pos, chrom, strand, longest_read,
                                        data, segmentation_subset, metrics,
                                        implicit_handling=implicit_handling)

    LOGGER.info('Writing output files...')
    header = ['RNAmap type', 'position', 'all', 'explicit']
    cross_tr_header = [
        'chrom', 'strand', 'xlink', 'second-start', 'end-position', 'read_len'
    ]
    with open(out_file, 'wt') as ofile, open(cross_transcript, 'wt') as ctfile:
        ofile.write('\t'.join(header) + '\n')
        ctfile.write('\t'.join(cross_tr_header) + '\n')
        for rna_map_type, positions in sorted(data.items()):
            if rna_map_type == 'cross_transcript':
                # This key holds read lists, not scores; it goes to its own file.
                for (chrom, strand, xlink), read_list in positions.items():
                    for (_, end, read_len, _, second_start) in read_list:
                        ctfile.write('\t'.join(
                            map(str, [
                                chrom, strand, xlink, second_start, end, read_len
                            ])) + '\n')
            else:
                for position, [all_, explic] in sorted(positions.items()):
                    # Round to 4 decimal places with _f2s function:
                    all_, explic = _f2s(all_, dec=4), _f2s(explic, dec=4)
                    ofile.write(
                        '\t'.join([rna_map_type, str(position), all_, explic]) + '\n')

    LOGGER.info('RNA-maps output written to: %s', out_file)
    LOGGER.info('Reads spanning multiple transcripts written to: %s',
                cross_transcript)
    LOGGER.info('Done.')

    return metrics
def run(annotation,
        sites,
        sigxls,
        scores=None,
        features=None,
        group_by='gene_id',
        merge_features=False,
        half_window=3,
        fdr=0.05,
        perms=100,
        rnd_seed=42,
        report_progress=False):
    """
    Find positions with high density of cross-linked sites.

    When determining feature.name, value of the first existing attribute in the
    following tuple is taken::

        ("ID", "gene_name", "transcript_id", "gene_id", "Parent")

    Source in pybedtools:
    https://github.com/daler/pybedtools/blob/master/pybedtools/scripts/annotate.py#L34

    Parameters
    ----------
    annotation : str
        Annotation file in GTF format, obtained from "iCount segment" command.
    sites : str
        File with cross-links in BED6 format.
    sigxls : str
        File name for "sigxls" output. File reports positions with significant
        number of cross-link events. It should have .bed or .bed.gz extension.
    scores : str
        File name for "scores" output. File reports all cross-link events,
        independent from their FDR score. It should have .tsv, .csv or .txt
        extension, optionally followed by .gz.
    features : list_str
        Features from annotation to consider. If None, ['gene'] is used.
        Sometimes, it is advised to use ['gene', 'intergenic'].
    group_by : str
        Attribute by which cross-link positions are grouped.
    merge_features : bool
        Treat all features as one when grouping. Has no effect when only one
        feature is given in features parameter.
    half_window : int
        Half-window size.
    fdr : float
        FDR threshold.
    perms : int
        Number of permutations when calculating random distribution.
    rnd_seed : int
        Seed for random generator.
    report_progress : bool
        Report analysis progress.

    Returns
    -------
    iCount.Metrics
        Analysis metadata.

    Raises
    ------
    AssertionError
        If an output file name has an unexpected extension, if an intersected
        record is inconsistent, or if segments of one group overlap.
    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if features is None:
        features = ['gene']

    assert sigxls.endswith(('.bed', '.bed.gz'))
    if scores:
        # Every extension carries its leading dot, so a name such as
        # "scorestxt" is not mistaken for a .txt file.
        assert scores.endswith(
            ('.tsv', '.tsv.gz', '.csv', '.csv.gz', '.txt', '.txt.gz'))

    numpy.random.seed(rnd_seed)  # pylint: disable=no-member

    LOGGER.info('Loading annotation file...')
    annotation2 = iCount.files.decompress_to_tempfile(annotation)
    if annotation2 != annotation:
        # A new file was produced for the annotation; remember it for cleanup.
        to_delete_temp = annotation2
        annotation = annotation2
    else:
        to_delete_temp = None

    # Everything below runs inside try/finally so that the temporary
    # annotation file is removed even when the analysis fails part-way.
    try:
        annotation = pybedtools.BedTool(annotation).saveas()
        metrics.annotation_all = len(annotation)
        annotation = annotation.filter(
            lambda x: x[2] in features).sort().saveas()
        metrics.annotation_used = len(annotation)
        metrics.annotation_skipped = metrics.annotation_all - metrics.annotation_used
        LOGGER.info('%d out of %d annotation records will be used (%d skipped).',
                    metrics.annotation_used, metrics.annotation_all,
                    metrics.annotation_skipped)

        LOGGER.info('Loading cross-links file...')
        sites = pybedtools.BedTool(sites).sort().saveas()

        # intersect cross-linked sites with regions
        LOGGER.info(
            'Calculating intersection between annotation and cross-link file...')
        overlaps = annotation.intersect(
            sites, sorted=True, s=True, wo=True).saveas()
        groups = {}
        group_sizes = {}
        multi_mode = len(features) > 1 and not merge_features
        LOGGER.info('Processing intersections...')
        for feature in overlaps:
            chrom = feature.chrom
            start = feature.start
            end = feature.stop
            name = feature.name
            strand = feature.strand
            # Columns 9-14 of each intersection record hold the cross-link's
            # BED6 fields, placed after the annotation columns.
            site_chrom = feature.fields[9]
            site_pos = int(feature.fields[10])
            site_end = int(feature.fields[11])
            site_dot = feature.fields[12]
            site_score = float(feature.fields[13])
            site_strand = feature.fields[14]
            assert site_chrom == chrom
            assert site_strand == strand
            assert site_dot == '.'
            assert site_pos == site_end - 1

            # Determine group_id depending on multi_mode: with several
            # unmerged features the feature type is prefixed to the group id.
            group_id = feature.attrs[group_by]
            if multi_mode:
                group_id = feature[2] + '_' + group_id

            group_key = (chrom, strand, group_id, name)
            groups.setdefault(group_key, []).append((site_pos, site_score))
            group_sizes.setdefault(group_key, set()).add((start, end))

        # Validate that segments in same group do not overlap: start of next
        # feature is greater than stop of the current one:
        for sizes in group_sizes.values():
            sizes = sorted(sizes)
            for first, second in zip(sizes, sizes[1:]):
                assert first[1] < second[0]

        # calculate total length of each group by summing element sizes:
        group_sizes = {
            key: sum(seg_end - seg_start for seg_start, seg_end in elements)
            for key, elements in group_sizes.items()
        }

        # calculate and assign FDRs to each cross-linked site. FDR values are
        # calculated together for each group.
        results = {}
        metrics.all_groups = len(groups)
        progress = 0
        for j, ((chrom, strand, group_id, name), hits) in enumerate(
                sorted(groups.items()), 1):
            if report_progress:
                new_progress = j / metrics.all_groups
                # pylint: disable=protected-access
                progress = iCount._log_progress(new_progress, progress, LOGGER)

            group_size = group_sizes[(chrom, strand, group_id, name)]
            # Crucial step: each position in a group is given a fdr_score,
            # based on hits in group, group_size, half-window size and number
            # of permutations. Then, FDR scores (+ some other info) are
            # written to `results` container:
            processed = _process_group(hits, group_size, half_window, perms)
            for (pos, val, val_extended, fdr_score) in processed:
                results.setdefault((chrom, pos, strand), []).append(
                    (fdr_score, name, group_id, val, val_extended))
        metrics.positions_annotated = len(results)

        # cross-linked sites outside annotated regions
        LOGGER.info('Determining cross-links not intersecting with annotation...')
        skipped = sites.intersect(
            annotation, sorted=True, s=True, v=True).saveas()
        for feature in skipped:
            site_chrom = feature.chrom
            site_start = feature.start
            site_end = feature.stop
            site_score = feature.score
            site_strand = feature.strand
            assert site_start == site_end - 1
            k = (site_chrom, site_start, site_strand)
            assert k not in results
            # Sites without annotation get FDR 1.0 and placeholder names.
            results.setdefault(k, []).append(
                (1.0, 'not_annotated', 'not_annotated', site_score,
                 'not_calculated'))
        metrics.positions_all = len(results)
        metrics.positions_not_annotated = metrics.positions_all - metrics.positions_annotated

        LOGGER.info(
            'Significant crosslinks calculation finished. Writing results to files...'
        )

        # Make sigxls: a BED6 file, with only the most significant cross-links:
        metrics.significant_positions = 0
        with iCount.files.gz_open(sigxls, 'wt') as sigxls_file:
            for (chrom, pos, strand), annot_list in sorted(results.items()):
                annot_list = sorted(annot_list)

                # report minimum fdr_score for each position in BED6
                min_fdr_score = annot_list[0][0]
                if min_fdr_score < fdr:
                    metrics.significant_positions += 1
                    # position has significant records - report the most
                    # significant ones:
                    min_fdr_records = [
                        rec for rec in annot_list if rec[0] == min_fdr_score
                    ]
                    _, names, group_ids, group_scores, _ = zip(*min_fdr_records)
                    if names == group_ids:
                        name = ','.join(names)
                    else:
                        name = ','.join(names) + '-' + ','.join(group_ids)
                    line = [chrom, pos, pos + 1, name, group_scores[0], strand]
                    sigxls_file.write(
                        '\t'.join([_f2s(i, dec=4) for i in line]) + '\n')
        LOGGER.info('BED6 file with significant crosslinks saved to: %s',
                    sigxls_file.name)

        # Make scores: a tab-separated file, with ALL cross-links, (no
        # significance threshold)
        if scores:
            header = [
                'chrom', 'position', 'strand', 'name', 'group_id', 'score',
                'score_extended', 'FDR'
            ]
            with iCount.files.gz_open(scores, 'wt') as scores_file:
                scores_file.write('\t'.join(header) + '\n')
                for (chrom, pos, strand), annot_list in sorted(results.items()):
                    for (fdr_score, name, group_id, score,
                         val_extended) in sorted(annot_list):
                        line = [
                            chrom, pos, strand, name, group_id, score,
                            val_extended, fdr_score
                        ]
                        scores_file.write(
                            '\t'.join([_f2s(i, dec=6) for i in line]) + '\n')
            LOGGER.info('Scores for each cross-linked position saved to: %s',
                        scores_file.name)
    finally:
        if to_delete_temp:
            os.remove(to_delete_temp)

    LOGGER.info('Done.')
    return metrics