def run(sites, peaks, clusters, dist=20, slop=3):
    """
    Join neighboring peaks into clusters and sum site scores per cluster.

    Peaks are merged strand-specifically with ``dist`` as the merge
    distance. Every site whose closest cluster is at most ``slop`` away is
    then merged with the clusters, and scores are summed.

    Parameters
    ----------
    sites : str
        Path to input BED6 file with sites.
    peaks : str
        Path to input BED6 file with peaks (or clusters).
    clusters : str
        Path to output BED6 file with merged peaks (clusters).
    dist : int
        Distance between two peaks to merge into same cluster.
    slop : int
        Distance between site and cluster to assign site to cluster.

    Returns
    -------
    iCount.Metrics
        Metrics object created for this run.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if dist <= slop:
        LOGGER.warning(
            'Distance between peaks (%s) should be larger than cluster slop (%s)',
            dist, slop)

    # Both inputs are sorted before any interval operation is applied.
    LOGGER.info('Reading individual sites from %s', sites)
    sorted_sites = pybedtools.BedTool(sites).sort().saveas()
    LOGGER.info('Reading peaks from %s', peaks)
    sorted_peaks = pybedtools.BedTool(peaks).sort().saveas()

    LOGGER.info('Merging peaks to form clusters')
    raw_clusters = sorted_peaks.merge(s=True, d=dist, c=[4], o='distinct').saveas()
    peak_clusters = raw_clusters.each(_fix_bed6_zeroscore).sort().saveas()

    LOGGER.info('Summing sites within identified clusters')

    def within_slop(hit):
        """Accept hits whose distance (last field) lies in [0, slop]."""
        return 0 <= int(hit.fields[-1]) <= slop

    # Pair every site with its closest cluster, keep only the near ones.
    closest_hits = sorted_sites.closest(
        peak_clusters, s=True, d=True, t='first', stream=True)
    nearby_sites = closest_hits.filter(within_slop).saveas()
    nearby_sites = nearby_sites.each(_select_bed6_noname).sort().saveas()

    # Combine the selected sites with the clusters identified above.
    combined = nearby_sites.cat(
        peak_clusters, postmerge=True, s=True, d=slop, c=[5, 4],
        o='sum,distinct').saveas()
    result = combined.each(_fix_bed6).sort().saveas(clusters)

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(result.fn))
    return metrics
def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
        segmentation=None, mapq_th=0, multimax=50, gap_th=4, report_progress=False):
    """
    Identify and quantify cross-linked sites.

    Interpret mapped sites and generate BED file with coordinates and
    number of cross-linked events.

    MAPQ is calculated mapq=int(-10*log10(1-1/Nmap)). By default we set
    the mapq_th to 0 to include all reads. Mapq score is very useful,
    because values coming from STAR are from a very limited set: 0 (5 or
    more multiple hits), 1 (4 or 3 multiple hits), 3 (2 multiple hits),
    255 (single hit)

    Parameters
    ----------
    bam : str
        Input BAM file with mapped reads.
    sites_single : str
        Output BED6 file to store data from single mapped reads.
    sites_multi : str
        Output BED6 file to store data from single and multi-mapped reads.
    skipped : str
        Output BAM file to store reads that do not map as expected by segmentation and
        reference genome sequence. If read's second start does not fall on any of
        segmentation borders, it is considered problematic. If segmentation is not provided,
        every read in two parts with gap longer than gap_th is not used (skipped).
        All such reads are reported to the user for further exploration.
    group_by : str
        Assign score of a read to either 'start', 'middle' or 'end' nucleotide.
    quant : str
        Report number of 'cDNA' or number of 'reads'.
    segmentation : str
        File with custom segmentation format (obtained by ``iCount segment``).
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    multimax : int
        Ignore reads, mapped to more than ``multimax`` places.
    report_progress : bool
        Switch to report progress.
    gap_th : int
        Reads with gaps less than gap_th are treated as if they have no gap.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    bed_suffixes = ('.bed', '.bed.gz')
    quant_choices = ['cDNA', 'reads']
    assert sites_single.endswith(bed_suffixes)
    assert sites_multi.endswith(bed_suffixes)
    assert skipped.endswith('.bam')
    assert quant in quant_choices
    assert group_by in ['start', 'middle', 'end']

    metrics = iCount.Metrics()

    single, multi = {}, {}
    progress = 0
    bam_chunks = _processs_bam_file(
        bam, metrics, mapq_th, skipped, segmentation, gap_th)
    for (chrom, strand), new_progress, by_pos in bam_chunks:
        if report_progress:
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        chunk_single, chunk_multi = {}, {}
        for xlink_pos, by_bc in by_pos.items():
            # Single-mapped reads only (multimax=1) ...
            _update(chunk_single,
                    _collapse(xlink_pos, by_bc, group_by, multimax=1))
            # ... and all reads within the user-supplied multimax limit.
            _update(chunk_multi,
                    _collapse(xlink_pos, by_bc, group_by, multimax=multimax))

        key = (chrom, strand)
        single.setdefault(key, {}).update(chunk_single)
        multi.setdefault(key, {}).update(chunk_multi)

    # Column to report is selected by position of `quant` among the choices.
    val_index = quant_choices.index(quant)
    _save_dict(single, sites_single, val_index=val_index)
    LOGGER.info('Saved to BED file (single mapped reads): %s', sites_single)
    _save_dict(multi, sites_multi, val_index=val_index)
    LOGGER.info('Saved to BED file (multi-mapped reads): %s', sites_multi)

    return metrics
def run(bam, segmentation, out_file, strange, cross_transcript, implicit_handling='closest',
        mismatches=2, mapq_th=0, holesize_th=4, max_barcodes=10000):
    """
    Compute distribution of cross-links relative to genomic landmarks.

    Results are written to ``out_file`` as a tab-separated table with
    columns 'RNAmap type', 'position', 'all' and 'explicit'. Reads collected
    under the 'cross_transcript' map type are written to
    ``cross_transcript`` instead.

    Parameters
    ----------
    bam : str
        BAM file with aligned reads.
    segmentation : str
        GTF file with segmentation. Should be a file produced by function
        `get_segments`.
    out_file : str
        Output file with analysis results.
    strange : str
        File with strange properties obtained when processing bam file.
    cross_transcript : str
        File with reads spanning over multiple transcripts or multiple genes.
    implicit_handling : str
        Can be 'closest' or 'split'. In case of implicit read - split score to
        both neighbours or give it just to the closest neighbour.
    mismatches : int
        Reads on same position with random barcode differing less than
        ``mismatches`` are grouped together.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    holesize_th : int
        Reads with size of holes less than holesize_th are treated as if they
        would have no holes.
    max_barcodes : int
        Skip merging similar barcodes if number of distinct barcodes at
        position is higher than this.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    Raises
    ------
    ValueError
        If ``implicit_handling`` is neither 'closest' nor 'split'.

    """
    iCount.logger.log_inputs(LOGGER)

    if implicit_handling not in ('closest', 'split'):
        raise ValueError(
            'Parameter implicit_handling should be one of "closest" or "split"'
        )

    metrics = iCount.Metrics()
    metrics.cross_transcript = 0
    metrics.origin_premrna = 0
    metrics.origin_mrna = 0
    metrics.origin_ambiguous = 0

    # The root container, keyed by RNA-map type (see the output section below):
    data = {}

    progress = 0
    LOGGER.info('Processing data...')
    # pylint: disable=protected-access
    for (chrom, strand), new_progress, by_pos in iCount.mapping.xlsites._processs_bam_file(
            bam, metrics, mapq_th, strange, segmentation=segmentation, gap_th=holesize_th):
        # pylint: disable=protected-access
        progress = iCount._log_progress(new_progress, progress, LOGGER)

        # Sort all genes (and intergenic) by start coordinate.
        segmentation_sorted = sorted(
            iCount.genomes.segment._prepare_segmentation(
                segmentation, chrom, strand).items(),
            key=lambda x: x[1]['gene_segment'].start)
        seg_max_index = len(segmentation_sorted) - 1
        # Window into segmentation_sorted. It is kept between positions: since
        # positions are visited in sorted order, each scan starts at the last
        # found start gene instead of at the beginning.
        start_gene_index, stop_gene_index = 0, seg_max_index

        for xlink_pos, by_bc in sorted(by_pos.items()):
            # pylint: disable=protected-access
            iCount.mapping.xlsites._merge_similar_randomers(
                by_bc, mismatches, max_barcodes)
            # by_bc is modified in place in _merge_similar_randomers

            # reads is a list of reads belonging to given barcode in by_bc
            for reads in by_bc.values():
                ss_groups = {}
                for read in reads:
                    # Define second start groups (read[4] is unpacked as
                    # `second_start` in the output section below):
                    ss_groups.setdefault(read[4], []).append(read)

                # Process each second start group:
                for ss_group in ss_groups.values():
                    # The following block extracts just the required genes (& gene_content)
                    # without iterating through all genes/content in chromosome.

                    # Sort reads by length and take the longest one (read_len is 3rd column)!
                    ss_group = sorted(ss_group, key=lambda x: (-x[2]))
                    stop = ss_group[0][1]
                    start = xlink_pos
                    segmentation_subset = []
                    passed_start = False  # Whether the start_gene_index was already found.
                    for gene_item in segmentation_sorted[start_gene_index:]:
                        gene_segment = gene_item[1]['gene_segment']

                        if gene_segment.start <= start <= gene_segment.stop:
                            start_gene_index = segmentation_sorted.index(
                                gene_item)
                            passed_start = True

                        # Collect every gene from the start gene onwards.
                        if passed_start:
                            segmentation_subset.append(gene_item)

                        if gene_segment.start <= stop <= gene_segment.stop:
                            stop_gene_index = segmentation_sorted.index(
                                gene_item)
                            # Append also one gene before (insert on first position to keep sorted)
                            segmentation_subset.insert(
                                0, segmentation_sorted[max(
                                    start_gene_index - 1, 0)])
                            # Append also one gene after:
                            segmentation_subset.append(segmentation_sorted[min(
                                stop_gene_index + 1, seg_max_index)])
                            break
                            # Even if entries repeat, this is still OK, since
                            # first and last gene need to be the ones not including
                            # start/stop!

                    # segmentation_subset is defined. Now process this group
                    # (only the longest read of the group is passed on):
                    _process_read_group(xlink_pos, chrom, strand, ss_group[0], data,
                                        segmentation_subset, metrics,
                                        implicit_handling=implicit_handling)

    LOGGER.info('Writing output files...')
    header = ['RNAmap type', 'position', 'all', 'explicit']
    cross_tr_header = [
        'chrom', 'strand', 'xlink', 'second-start', 'end-position', 'read_len'
    ]
    with open(out_file, 'wt') as ofile, open(cross_transcript, 'wt') as ctfile:
        ofile.write('\t'.join(header) + '\n')
        ctfile.write('\t'.join(cross_tr_header) + '\n')
        for rna_map_type, positions in sorted(data.items()):
            if rna_map_type == 'cross_transcript':
                # These entries hold lists of reads and go to their own file.
                for (chrom, strand, xlink), read_list in positions.items():
                    for (_, end, read_len, _, second_start) in read_list:
                        ctfile.write('\t'.join(
                            map(str, [
                                chrom, strand, xlink, second_start, end,
                                read_len
                            ])) + '\n')
            else:
                for position, [all_, explic] in sorted(positions.items()):
                    # Round to 4 decimal places with _f2s function:
                    all_, explic = _f2s(all_, dec=4), _f2s(explic, dec=4)
                    ofile.write(
                        '\t'.join([rna_map_type,
                                   str(position), all_, explic]) + '\n')

    LOGGER.info('RNA-maps output written to: %s', out_file)
    LOGGER.info('Reads spanning multiple transcripts written to: %s',
                cross_transcript)
    LOGGER.info('Done.')
    return metrics
def run(reads, adapter, barcodes, mismatches=1, minimum_length=15, prefix='demux', out_dir='.'):
    """
    Demultiplex FASTQ file.

    Split input FASTQ file into separate files, one for each barcode, and
    additional file for non-matching barcodes. Output files are named
    ``<prefix>_<barcode>.fastq.gz`` and ``<prefix>_nomatch.fastq.gz``.

    Parameters
    ----------
    reads : str
        Path to reads from a sequencing library.
    adapter : str
        Adapter sequence to remove from ends of reads.
    barcodes : list_str
        List of barcodes used for library.
    mismatches : int
        Number of tolerated mismatches when comparing barcodes.
    minimum_length : int
        Minimum length of trimmed sequence to keep.
    prefix : str
        Prefix of generated FASTQ files.
    out_dir : str
        Output folder. Use local folder if none given.

    Returns
    -------
    iCount.Metrics
        Metrics object with ``reads_ok`` and ``reads_fail`` counters.

    Raises
    ------
    FileNotFoundError
        If ``out_dir`` is not an existing directory.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.reads_ok = 0
    metrics.reads_fail = 0

    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    tmp_suffix = '_tmp.fastq.gz'
    # The "nomatch" file is always the last one; list() also accepts tuples.
    labels = list(barcodes) + ['nomatch']
    nomatch_index = len(labels) - 1
    out_fnames = [
        os.path.abspath(
            os.path.join(out_dir, '{}_{}{}'.format(prefix, label, tmp_suffix)))
        for label in labels
    ]

    LOGGER.info('Demultiplexing...')
    out_fastqs = []
    try:
        # One file handle for each output filename. Opened one by one so that
        # already opened handles get closed if a later open fails.
        for fname in out_fnames:
            out_fastqs.append(iCount.files.fastq.FastqFile(fname, 'wt'))

        # Determine experiment ID and random barcode for each fastq entry:
        entries = _extract(
            reads, barcodes, mismatches=mismatches, minimum_length=minimum_length)
        for fq_entry, exp_id, randomer in entries:
            if randomer:
                metrics.reads_ok += 1
                # Slice (not index) so that very short ids do not raise.
                if fq_entry.id[-2:-1] == '/':
                    # For early versions of Illumina, keep mate info at the end:
                    r_pair = fq_entry.id[-2:]
                    fq_entry.id = '{}:rbc:{}{}'.format(
                        fq_entry.id[:-2], randomer, r_pair)
                else:
                    fq_entry.id = '{}:rbc:{}'.format(fq_entry.id, randomer)
            else:
                metrics.reads_fail += 1

            out_fastqs[exp_id].write(fq_entry)
    finally:
        # Close handles even when extraction or writing fails.
        for out_fastq in out_fastqs:
            out_fastq.close()

    # Finally, remove adapters (if requested) or just rename to final names.
    # The nomatch file is identified by position rather than by searching the
    # path for "nomatch", which would also match out_dir or prefix.
    for index, fname_in in enumerate(out_fnames):
        fname_out = '{}.fastq.gz'.format(fname_in[:-len(tmp_suffix)])
        if adapter and index != nomatch_index:
            remove_adapter(fname_in, fname_out, adapter,
                           minimum_length=minimum_length)
            os.remove(fname_in)
        else:
            shutil.move(fname_in, fname_out)

    return metrics
def summary_reports(annotation, sites, out_dir, templates_dir=None):
    """
    Make summary reports for a cross-link file.

    Three tab-separated reports are written to ``out_dir``: per type, per
    subtype and per gene. Each lists the length taken from the matching
    template, the summed score ('cDNA #') and its share of the total score
    in ``sites`` ('cDNA %').

    Parameters
    ----------
    annotation : str
        Annotation file (GTF format). It is recommended to use genome-level
        segmentation (e.g. regions.gtf.gz), that is produced by ``iCount
        segment`` command.
    sites : str
        Cross-links file (BED6 format). Should be sorted by coordinate.
    out_dir : str
        Output directory.
    templates_dir : str
        Directory containing templates for summary calculation. Made by
        ``iCount segment`` command. If this argument is not provided,
        summary templates are made on the fly.

    Returns
    -------
    iCount.Metrics
        iCount Metrics object.

    Raises
    ------
    ValueError
        If cross-links and annotation do not intersect at all.

    """
    import shutil  # local: only needed to remove the on-the-fly templates

    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    # Remember whether the templates directory is ours to clean up.
    own_templates_dir = None
    if templates_dir is None:
        own_templates_dir = templates_dir = tempfile.mkdtemp()

    try:
        if own_templates_dir is not None:
            summary_templates(annotation, templates_dir)

        LOGGER.info(
            'Calculating intersection between cross-link and annotation...')
        # pylint: disable=too-many-function-args,unexpected-keyword-arg
        overlaps = BedTool(sites).intersect(
            BedTool(annotation),
            sorted=True,  # invokes memory efficient algorithm for large files
            s=True,  # only report hits in B that overlap A on the same strand
            wb=True,  # write the original entry in B for each overlap
            nonamecheck=True,  # Do not print warnings about name inconsistency to stdout
        ).saveas()
        # pylint: enable=too-many-function-args,unexpected-keyword-arg
        try:
            overlaps[0]  # will raise Error if overlaps is empty:
        except (IndexError, TypeError) as err:
            raise ValueError(
                'No intersections found. This may be caused by different naming of '
                'chromosomes in annotation and cross-links file (example: "chr1" vs. "1")'
            ) from err

        type_counter, subtype_counter, gene_counter = {}, {}, {}
        LOGGER.info('Extracting summary data from intersection...')
        for segment in overlaps:
            score = int(segment.score)
            type_ = segment[8]
            type_counter[type_] = type_counter.get(type_, 0) + score

            # A segment may list several comma-separated biotypes; its score
            # is split evenly among them.
            biotype = re.match(r'.*biotype "(.*?)";', segment[-1])
            biotype = biotype.group(1) if biotype else ''
            biotypes = biotype.split(',')
            for biotype in biotypes:
                sbtyp = iCount.genomes.region.make_subtype(type_, biotype)
                subtype_counter[sbtyp] = subtype_counter.get(
                    sbtyp, 0) + score / len(biotypes)

            gene_id = re.match(r'.*gene_id "(.*?)";', segment[-1])
            gene_id = gene_id.group(1) if gene_id else None
            gene_counter[gene_id] = gene_counter.get(gene_id, 0) + score

        sum_cdna = sum(int(seg.score) for seg in BedTool(sites))

        def parse_template(template_file):
            """Parse template file into {first column: [remaining columns]}."""
            template = {}
            with open(template_file, 'rt') as ifile:
                for line in ifile:
                    line = line.strip().split('\t')
                    template[line[0]] = line[1:]
            return template

        def percent(cdna):
            """Return share of total cDNA in percent (0.0 if the total is zero)."""
            return cdna / sum_cdna * 100 if sum_cdna else 0.0

        def write_report(filename, header, rows):
            """Write header and rows as a tab-separated file into out_dir."""
            with open(os.path.join(out_dir, filename), 'wt') as out:
                out.write('\t'.join(header) + '\n')
                for row in rows:
                    out.write('\t'.join(map(str, row)) + '\n')

        def typed_rows(counter, template):
            """Yield report rows for type/subtype counters in report order."""
            ordered = sorted(counter.items(),
                             key=lambda x: sort_types_subtypes(x[0]))
            for name, cdna in ordered:
                # Length is -1 when the name is missing from the template.
                yield [name, template.get(name, [-1])[0], math.floor(cdna),
                       percent(cdna)]

        def gene_rows(template):
            """Yield report rows for genes, sorted by gene ID."""
            # gene_id is None when no gene_id attribute was found; sort such
            # an entry last instead of failing on None/str comparison.
            ordered = sorted(gene_counter.items(),
                             key=lambda x: (x[0] is None, x[0] or ''))
            for gene_id, cdna in ordered:
                gene_name, length = template.get(gene_id, ['', -1])
                if gene_id == '.':
                    gene_name = 'intergenic'
                yield ['{} ({})'.format(gene_name, gene_id), length,
                       math.floor(cdna), percent(cdna)]

        LOGGER.info('Writing type report...')
        type_template = parse_template(os.path.join(templates_dir, TEMPLATE_TYPE))
        write_report(SUMMARY_TYPE, ['Type', 'Length', 'cDNA #', 'cDNA %'],
                     typed_rows(type_counter, type_template))

        LOGGER.info('Writing subtype report...')
        subtype_template = parse_template(
            os.path.join(templates_dir, TEMPLATE_SUBTYPE))
        write_report(SUMMARY_SUBTYPE, ['Subtype', 'Length', 'cDNA #', 'cDNA %'],
                     typed_rows(subtype_counter, subtype_template))

        LOGGER.info('Writing gene report...')
        gene_template = parse_template(os.path.join(templates_dir, TEMPLATE_GENE))
        write_report(SUMMARY_GENE,
                     ['Gene name (Gene ID)', 'Length', 'cDNA #', 'cDNA %'],
                     gene_rows(gene_template))
    finally:
        # Templates made on the fly are of no use once reports are written.
        if own_templates_dir is not None:
            shutil.rmtree(own_templates_dir, ignore_errors=True)

    LOGGER.info('Done.')
    return metrics
def run(annotation, sites, sigxls, scores=None, features=None, group_by='gene_id',
        merge_features=False, half_window=3, fdr=0.05, perms=100, rnd_seed=42,
        report_progress=False):
    """
    Find positions with high density of cross-linked sites.

    When determining feature.name, value of the first existing attribute in the
    following tuple is taken::

        ("ID", "gene_name", "transcript_id", "gene_id", "Parent")

    Source in pybedtools:
    https://github.com/daler/pybedtools/blob/master/pybedtools/scripts/annotate.py#L34

    Parameters
    ----------
    annotation : str
        Annotation file in GTF format, obtained from "iCount segment" command.
    sites : str
        File with cross-links in BED6 format.
    sigxls : str
        File name for "sigxls" output. File reports positions with significant
        number of cross-link events. It should have .bed or .bed.gz extension.
    scores : str
        File name for "scores" output. File reports all cross-link events,
        independent from their FDR score It should have .tsv, .csv, .txt or .gz
        extension.
    features : list_str
        Features from annotation to consider. If None, ['gene'] is used.
        Sometimes, it is advised to use ['gene', 'intergenic'].
    group_by : str
        Attribute by which cross-link positions are grouped.
    merge_features : bool
        Treat all features as one when grouping. Has no effect when only one
        feature is given in features parameter.
    half_window : int
        Half-window size.
    fdr : float
        FDR threshold.
    perms : int
        Number of permutations when calculating random distribution.
    rnd_seed : int
        Seed for random generator.
    report_progress : bool
        Report analysis progress.

    Returns
    -------
    iCount.metrics
        Analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if features is None:
        features = ['gene']
    assert sigxls.endswith(('.bed', '.bed.gz'))
    if scores:
        # NOTE(review): 'txt' and 'txt.gz' have no leading dot, so e.g.
        # "footxt" is accepted. Left unchanged to keep accepted inputs as is.
        assert scores.endswith(
            ('.tsv', '.tsv.gz', '.csv', '.csv.gz', 'txt', 'txt.gz'))

    numpy.random.seed(rnd_seed)  # pylint: disable=no-member

    LOGGER.info('Loading annotation file...')
    annotation2 = iCount.files.decompress_to_tempfile(annotation)
    if annotation2 != annotation:
        # A temporary decompressed copy was made; it has to be removed.
        to_delete_temp = annotation2
        annotation = annotation2
    else:
        to_delete_temp = None

    try:
        annotation = pybedtools.BedTool(annotation).saveas()
        metrics.annotation_all = len(annotation)
        annotation = annotation.filter(lambda x: x[2] in features).sort().saveas()
        metrics.annotation_used = len(annotation)
        metrics.annotation_skipped = metrics.annotation_all - metrics.annotation_used
        LOGGER.info('%d out of %d annotation records will be used (%d skipped).',
                    metrics.annotation_used, metrics.annotation_all,
                    metrics.annotation_skipped)

        LOGGER.info('Loading cross-links file...')
        sites = pybedtools.BedTool(sites).sort().saveas()

        # intersect cross-linked sites with regions
        LOGGER.info(
            'Calculating intersection between annotation and cross-link file...')
        overlaps = annotation.intersect(sites, sorted=True, s=True, wo=True).saveas()

        groups = {}
        group_segments = {}
        multi_mode = len(features) > 1 and not merge_features
        LOGGER.info('Processing intersections...')
        for feature in overlaps:
            chrom = feature.chrom
            start = feature.start
            end = feature.stop
            name = feature.name
            strand = feature.strand
            # Fields 9-14 hold the BED6 columns of the intersecting site.
            site_chrom = feature.fields[9]
            site_pos = int(feature.fields[10])
            site_end = int(feature.fields[11])
            site_dot = feature.fields[12]
            site_score = float(feature.fields[13])
            site_strand = feature.fields[14]
            assert site_chrom == chrom
            assert site_strand == strand
            assert site_dot == '.'
            assert site_pos == site_end - 1

            # Determine group_id depending on multi_mode: when several
            # features are kept apart, the feature type is part of the id.
            group_id = feature.attrs[group_by]
            if multi_mode:
                group_id = feature[2] + '_' + group_id

            group_key = (chrom, strand, group_id, name)
            groups.setdefault(group_key, []).append((site_pos, site_score))
            group_segments.setdefault(group_key, set()).add((start, end))

        # Validate that segments in same group do not overlap: start of next
        # feature is greater than stop of the current one:
        for segments in group_segments.values():
            segments = sorted(segments)
            for first, second in zip(segments, segments[1:]):
                assert first[1] < second[0]

        # calculate total length of each group by summing element sizes:
        group_sizes = {
            group_key: sum(seg_end - seg_start for seg_start, seg_end in segments)
            for group_key, segments in group_segments.items()
        }

        # calculate and assign FDRs to each cross-linked site. FDR values are
        # calculated together for each group.
        results = {}
        metrics.all_groups = len(groups)
        progress = 0
        for j, (group_key, hits) in enumerate(sorted(groups.items()), start=1):
            chrom, strand, group_id, name = group_key
            if report_progress:
                new_progress = j / metrics.all_groups
                # pylint: disable=protected-access
                progress = iCount._log_progress(new_progress, progress, LOGGER)

            # Crucial step: each position in a group is given a fdr_score,
            # based on hits in group, group_size, half-window size and number
            # of permutations. Then, FDR scores (+ some other info) are
            # written to `results` container:
            processed = _process_group(
                hits, group_sizes[group_key], half_window, perms)
            for (pos, val, val_extended, fdr_score) in processed:
                results.setdefault((chrom, pos, strand), []).append(
                    (fdr_score, name, group_id, val, val_extended))
        metrics.positions_annotated = len(results)

        # cross-linked sites outside annotated regions
        LOGGER.info('Determining cross-links not intersecting with annotation...')
        skipped = sites.intersect(annotation, sorted=True, s=True, v=True).saveas()
        for feature in skipped:
            site_chrom = feature.chrom
            site_start = feature.start
            site_end = feature.stop
            site_score = feature.score
            site_strand = feature.strand
            assert site_start == site_end - 1
            k = (site_chrom, site_start, site_strand)
            assert k not in results
            # Such sites get FDR of 1.0 and placeholder names.
            results.setdefault(k, []).append(
                (1.0, 'not_annotated', 'not_annotated', site_score, 'not_calculated'))
        metrics.positions_all = len(results)
        metrics.positions_not_annotated = metrics.positions_all - metrics.positions_annotated

        LOGGER.info(
            'Significant crosslinks calculation finished. Writing results to files...'
        )

        # Make sigxls: a BED6 file, with only the most significant cross-links:
        metrics.significant_positions = 0
        with iCount.files.gz_open(sigxls, 'wt') as sigxls_file:
            for (chrom, pos, strand), annot_list in sorted(results.items()):
                annot_list = sorted(annot_list)

                # report minimum fdr_score for each position in BED6
                min_fdr_score = annot_list[0][0]
                if min_fdr_score < fdr:
                    metrics.significant_positions += 1
                    # position has significant records - report the most
                    # significant ones:
                    min_fdr_records = [
                        rec for rec in annot_list if rec[0] == min_fdr_score
                    ]
                    _, names, group_ids, group_scores, _ = zip(*min_fdr_records)
                    if names == group_ids:
                        name = ','.join(names)
                    else:
                        name = ','.join(names) + '-' + ','.join(group_ids)
                    line = [chrom, pos, pos + 1, name, group_scores[0], strand]
                    sigxls_file.write(
                        '\t'.join([_f2s(i, dec=4) for i in line]) + '\n')
        LOGGER.info('BED6 file with significant crosslinks saved to: %s',
                    sigxls_file.name)

        # Make scores: a tab-separated file, with ALL cross-links, (no
        # significance threshold)
        header = [
            'chrom', 'position', 'strand', 'name', 'group_id', 'score',
            'score_extended', 'FDR'
        ]
        if scores:
            with iCount.files.gz_open(scores, 'wt') as scores_file:
                scores_file.write('\t'.join(header) + '\n')
                for (chrom, pos, strand), annot_list in sorted(results.items()):
                    for (fdr_score, name, group_id, score,
                         val_extended) in sorted(annot_list):
                        line = [
                            chrom, pos, strand, name, group_id, score,
                            val_extended, fdr_score
                        ]
                        scores_file.write(
                            '\t'.join([_f2s(i, dec=6) for i in line]) + '\n')
            LOGGER.info('Scores for each cross-linked position saved to: %s',
                        scores_file.name)
    finally:
        # Remove the decompressed annotation also when the analysis fails.
        if to_delete_temp:
            os.remove(to_delete_temp)

    LOGGER.info('Done.')
    return metrics
def run(reads, adapter, barcodes5, barcodes3=None, mismatches=1, minimum_length=15,
        min_adapter_overlap=7, prefix='demux', out_dir='.'):
    """
    Demultiplex FASTQ file.

    Split input FASTQ file into separate files, one for each barcode, and
    additional file for non-matching barcodes. Write random barcode of a read
    into its FASTQ header row.

    Reads are first demultiplexed by 5' barcodes. For every 5' barcode that
    has 3' barcodes assigned, the resulting file is demultiplexed again by
    those 3' barcodes (with no mismatches tolerated) and then removed.

    Parameters
    ----------
    reads : str
        Sequencing reads.
    adapter : str
        Adapter sequence to remove from 3-prime end of reads.
    barcodes5 : list_str
        List of 5-prime end barcodes.
    barcodes3 : list_str
        List of 3-prime end barcodes.
    mismatches : int
        Number of tolerated mismatches when comparing barcodes.
    minimum_length : int
        Minimum length of trimmed sequence to keep.
    min_adapter_overlap : int
        Minimum length of adapter on 3' end if demultiplexing also on 3' barcodes.
    prefix : str
        Prefix of generated FASTQ files.
    out_dir : str
        Output folder. Use current folder if none is given.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    Raises
    ------
    FileNotFoundError
        If ``out_dir`` is not an existing directory.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.reads_ok = 0
    metrics.reads_fail = 0

    kwargs = {
        'mismatches': mismatches,
        'minimum_length': minimum_length,
        'prefix': prefix,
        'out_dir': out_dir,
        'metrics': metrics,
    }

    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    # This should be a dict, where each 5' barcode has a list of its 3' barcodes
    barcodes = prepare_barcodes(barcodes5, barcodes3)

    LOGGER.info("Demultiplexing based on 5' barcodes...")
    # Demultiplex by 5' barcodes only. Store in files.
    demultiplex(reads=reads, barcodes=barcodes, **kwargs)
    # File names follow the requested prefix instead of a hard-coded "demux".
    # NOTE(review): assumes `demultiplex` names its outputs
    # '<prefix>_<barcode>.fastq.gz' - confirm against its implementation.
    os.rename(
        os.path.join(out_dir, '{}_nomatch.fastq.gz'.format(prefix)),
        os.path.join(out_dir, '{}_nomatch5.fastq.gz'.format(prefix)),
    )

    LOGGER.info("Demultiplexing based on 3' barcodes...")
    for barcode5 in barcodes:
        reads5 = os.path.join(out_dir, '{}_{}.fastq.gz'.format(prefix, barcode5))
        barcodes3_of_5 = barcodes[barcode5]['barcodes3']

        if not barcodes3_of_5:
            # This barcode has no 3' counterparts. Just remove the adapter and continue
            # TODO: polish the parameters for adapter removal in this case...
            remove_adapter(reads5, adapter, overwrite=True)
            continue

        # One must be sure that there actually are 3' barcodes on the
        # 3' end. In cca. 20-30% of cases read is so short that it does
        # not reach the 3' barcode. In such cases, demultiplexing by 3'
        # end would be done by random chance which is not acceptable.
        # To be sure that 3' barcode is reached, read needs to
        # contain at least ``min_adapter_overlap`` bp of the adapter.
        no_adapters = os.path.join(
            out_dir, "no_adapter_found_{}.fastq.gz".format(barcode5))
        remove_adapter(reads5, adapter, overwrite=True,
                       overlap=min_adapter_overlap, untrimmed_output=no_adapters)

        # Settings for the 3' pass: prefix includes 5' barcode info and no
        # mismatches are tolerated. A copy keeps the shared kwargs untouched.
        kwargs3 = dict(kwargs)
        kwargs3['prefix'] = '{}_{}'.format(prefix, barcode5)
        kwargs3['mismatches'] = 0
        # Now, demultiplex based on 3' barcodes
        demultiplex(reads=reads5, barcodes=barcodes3_of_5, **kwargs3)

        # File that is demultiplexed only by 5' end is not needed anymore
        os.remove(reads5)

    # TODO: merge nomatch files 3' and 5' end... This may be many!

    return metrics
def make_summary_report(annotation, sites, summary, fai, types_length_file=None, digits='8',
                        subtype=None, excluded_types=None):
    """
    Make summary report from cross-link and annotation data.

    In the context of this report "type" equals to the combination of 3rd
    column and attribute `subtype` from annotation file (GTF). "Regions" are
    parts of transcript (UTR, CDS, introns...) that "non-intersectingly" span
    the whole transcript. Intergenic regions are also considered as region.
    Each region has one and only one type.

    For each of such types, number of cross-link events/sites is counted.
    Sites = sum of *all* cross-link sites that are in regions of some type.
    Events = sum of col5 for *all* cross-link sites that are in regions of some
    type (multiple events can happen on each cross link). Col5 is numerical
    value in 5th column of cross-link file.

    Parameters
    ----------
    annotation : str
        Path to annotation GTF file (should include subtype attribute).
    sites : str
        Path to BED6 file listing cross-linked sites.
    summary : str
        Path to output tab-delimited file with summary statistics.
    fai : str
        Path to file with chromosome lengths.
    types_length_file : str
        Path to file with lengths of each type.
    digits : int
        Number of decimal places in results.
    subtype : str
        Name of attribute to be used as subtype.
    excluded_types : list_str
        Types listed in 3rd column of GTF to be exclude from analysis.

    Returns
    -------
    iCount.Metrics
        Metrics object created for this run. The report itself is written
        to ``summary``.

    Raises
    ------
    ValueError
        If cross-links and annotation do not intersect at all.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    # If not given/present, make file with cumulative length for each type:
    if not types_length_file or not os.path.isfile(types_length_file):
        LOGGER.info('types_length_file not given - calculating it')
        types_length_file, annotation = make_types_length_file(
            annotation, fai, subtype=subtype, excluded_types=excluded_types)

    # read the file to dict, named type_lengths. The last column is the
    # length, everything before it is the (possibly multi-word) type name.
    type_lengths = {}
    with open(types_length_file) as tfile:
        for line in tfile:
            parts = line.strip().split()
            if not parts:
                continue  # tolerate blank lines
            type_lengths[' '.join(parts[:-1])] = int(parts[-1])

    # sorted=True - invokes memory efficient algorithm for large files
    # s=True - only report hits in B that overlap A on the same strand
    # wb=True - Write the original entry in B for each overlap
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).sort().saveas()
    LOGGER.info(
        'Calculating intersection between cross-link and annotation...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True, wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError) as err:
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and '
                         'cross-links file (example: "chr1" vs. "1")') from err

    # dict structure = type_: [# of sites, # of events]
    type_counter = {type_: [0, 0] for type_ in type_lengths}
    site_types = []
    previous_segment = overlaps[0]

    # Non-greedy capture: stop at the first closing quote of the attribute,
    # so that attributes following the subtype are not swallowed.
    subtype_re = re.compile(r'.*{} "(.*?)";'.format(subtype)) if subtype else None

    def site_key(segment):
        """Return identity of the cross-link site a segment belongs to."""
        # Chromosome is part of the identity: the last site of one chromosome
        # and the first site of the next can share start and strand.
        return (segment.chrom, segment.start, segment.strand)

    def finalize(types, segment):
        """Increase counter for all types that intersect with segment."""
        for type_ in set(types):
            type_counter[type_][0] += 1  # sites
            type_counter[type_][1] += int(segment[4])  # events

    LOGGER.info('Extracting summary from data...')
    for segment in overlaps:
        # detect if segment contains new cross-link site:
        if site_key(segment) != site_key(previous_segment):
            finalize(site_types, previous_segment)
            site_types = []

        if subtype_re is not None:
            # Extract subtype attribute:
            stype = subtype_re.match(segment[-1])
            stype = stype.group(1) if stype else None
            site_types.append(
                ' '.join([segment[8], stype] if stype else [segment[8]]))
        else:
            site_types.append(segment[8])
        previous_segment = segment
    finalize(site_types, previous_segment)

    # Produce report file:
    header = [
        'type', 'length', 'length %', 'sites #', 'sites %', 'sites enrichment',
        'events #', 'events %', 'events enrichment'
    ]
    sum_sites = sum(counts[0] for counts in type_counter.values())
    sum_events = sum(counts[1] for counts in type_counter.values())

    # total genome len = sum_of_all_chrom_length * 2 (there are + and - strand):
    with open(fai) as fai_file:
        total_length = sum(
            int(line.strip().split()[1]) for line in fai_file) * 2

    ndigits = int(digits)
    with open(summary, 'wt') as out:
        out.write('\t'.join(header) + '\n')
        for type_, [sites, events] in sorted(type_counter.items()):
            length_percent = type_lengths[type_] / total_length
            site_percent = sites / sum_sites
            site_enrichment = site_percent / length_percent
            event_percent = events / sum_events
            event_enrichment = event_percent / length_percent
            line = [
                type_, type_lengths[type_], length_percent, sites,
                site_percent, site_enrichment, events, event_percent,
                event_enrichment
            ]
            # Round everything but the type name:
            line = line[:1] + [round(i, ndigits) for i in line[1:]]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(summary))
    return metrics
def annotate_cross_links(annotation, sites, sites_annotated, subtype='biotype',
                         excluded_types=None):
    """
    Annotate each cross-link site with all region types that intersect it.

    Each cross link can overlap/intersect with many intervals in annotation
    file. Each of these intervals has a given type. Make a copy of cross link
    file and include all these types in 4th column.

    In the context of this report "type" equals to the combination of 3rd
    column and attribute `subtype` from annotation file (GTF).

    Regions/intervals are parts of transcript (UTR, CDS, introns...) that
    "non-intersectingly" span the whole transcript. However, since transcripts
    can overlap, also intervals belonging to different transcripts can overlap.
    Intergenic regions are also considered as region. Each region has one and
    only one type.

    Parameters
    ----------
    annotation : str
        Path to annotation file (should be GTF and include `subtype` attribute).
    sites : str
        Path to input BED6 file listing all cross-linked sites.
    sites_annotated : str
        Path to output BED6 file listing annotated cross-linked sites.
    subtype : str
        Name of the GTF attribute appended to the region type. If empty, only
        the region type (3rd GTF column) is used.
    excluded_types : list_str
        Region types (3rd GTF column) to ignore when annotating.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    Raises
    ------
    ValueError
        If no cross-link site intersects any annotation interval.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    excluded_types = excluded_types or []
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).filter(
        lambda x: x[2] not in excluded_types).sort().saveas()

    LOGGER.info(
        'Calculating overlaps between cross-link and annotation_file...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True,
                                     wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError):
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and '
                         'cross_links file ("chr1" vs. "1")')

    data = []  # container for final annotated BED file intervals
    site_types = []  # container for all types intersecting with given cross-link
    previous_interval = overlaps[0]

    # Compile the subtype pattern once instead of on every overlap line.
    subtype_regex = None
    if subtype:
        subtype_regex = re.compile(r'.*{} "(.*?)";'.format(subtype))

    def finalize(types, site):
        """Make annotated (with all intersecting types) cross link interval."""
        data.append(
            create_interval_from_list(
                site[0:3] + ['; '.join(map(str, sorted(set(types))))] +
                site[4:6]))

    for interval in overlaps:
        # Detect new cross link. The chromosome has to be compared as well:
        # the last site of one chromosome and the first site of the next one
        # may share start and strand, and would otherwise be merged.
        if (interval.chrom != previous_interval.chrom
                or interval.start != previous_interval.start
                or interval.strand != previous_interval.strand):
            finalize(site_types, previous_interval)
            site_types = []

        if subtype_regex is not None:
            # Extract subtype attribute from the GTF attributes column:
            stype = subtype_regex.match(interval[-1])
            site_types.append('{} {}'.format(
                interval[8], stype.group(1) if stype else '.'))
        else:
            site_types.append(interval[8])
        previous_interval = interval
    # Flush the last cross link, which the loop never finalizes itself.
    finalize(site_types, previous_interval)

    # Produce annotated cross-link file:
    LOGGER.info('Writing results to file...')
    # To save with .gz compression, file has to first be saved to tmp_dir and
    # saved to filename with .gz extension later. Otherwise, file has gzip
    # extension, but it is not actually gzipped.
    tmp_ann = pybedtools.BedTool(line for line in data).saveas()
    tmp_ann.saveas(sites_annotated)
    LOGGER.info('Done. Output saved to: %s', os.path.abspath(sites_annotated))
    return metrics
def get_segments(annotation, segmentation, fai, report_progress=False):
    """
    Create GTF file with transcript level segmentation.

    Each line in this file should define one of the following elements:

        * gene
        * transcript
        * CDS
        * UTR3
        * UTR5
        * intron
        * ncRNA
        * intergenic

    Name of third field (interval.fields[2]) should correspond to one of these
    names. Only consider GTF entries of chromosomes given in fai file.

    After the segmentation is written, ``make_regions`` is called on it with
    the directory of ``segmentation`` as its output directory.

    Parameters
    ----------
    annotation : str
        Path to input GTF file.
    segmentation : str
        Path to output GTF file.
    fai : str
        Path to input genome_file (.fai or similar).
    report_progress : bool
        Show progress.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata (``genes`` holds the number
        of processed genes).

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.genes = 0

    # Container for storing intermediate data
    data = []

    LOGGER.debug('Opening genome file: %s', fai)
    fai = _first_two_columns(fai)
    with open(fai) as gfile:
        chromosomes = [line.strip().split()[0] for line in gfile]

    def process_gene(gene_content):
        """
        Process each group of intervals belonging to gene.

        Process each transcript_group in gene_content, add 'biotype'
        attribute to all intervals and include them in `data`.
        """
        assert 'gene' in gene_content

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            gene_content[id_] = _process_transcript_group(transcript_group)

        # Add biotype attribute to all intervals:
        gene_content = _add_biotype_attribute(gene_content)

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            data.extend(transcript_group)
        data.append(gene_content['gene'])

    LOGGER.debug('Processing genome annotation from: %s', annotation)
    for gene_content in _get_gene_content(annotation, chromosomes, report_progress):
        process_gene(gene_content)
        LOGGER.debug('Just processed gene: %s', gene_content['gene'].attrs['gene_id'])
        metrics.genes += 1

    # Produce GTF/GFF file from data:
    gtf = BedTool(i.fields for i in data).saveas()

    LOGGER.info('Calculating intergenic intervals...')
    intergenic_pos = _complement(gtf.fn, fai, '+')
    intergenic_neg = _complement(gtf.fn, fai, '-')

    # Join the gtf, intergenic_pos and intergenic_neg in one file. Context
    # managers make sure every handle is closed, also when copying fails.
    with tempfile.NamedTemporaryFile(delete=False) as file2:
        for infile in [gtf.fn, intergenic_pos, intergenic_neg]:
            with open(infile, 'rb') as handle:
                shutil.copyfileobj(handle, file2)

    file3 = BedTool(file2.name).sort().saveas(segmentation)
    LOGGER.info('Segmentation stored in %s', file3.fn)

    LOGGER.info('Making also gene level segmentation...')
    make_regions(segmentation, out_dir=os.path.dirname(os.path.abspath(segmentation)))

    return metrics
def get_regions(annotation, segmentation, fai, report_progress=False):
    """
    Create new gtf file with custom annotation and filtered content.

    Each line in new file should define one of the following elements:

        * gene
        * transcript
        * CDS
        * intron
        * UTR3
        * UTR5
        * stop_codon
        * ncRNA
        * intergenic

    Name of third field (interval.fields[2]) should correspond to one of these
    names. Only consider GTF entries of chromosomes given in genome_file.

    Parameters
    ----------
    annotation : str
        Path to input GTF file.
    segmentation : str
        Path to output GTF file.
    fai : str
        Path to input genome_file (.fai or similar).
    report_progress : bool
        Switch to show progress.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata (``genes`` holds the number
        of processed genes).

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.genes = 0

    # Container for storing intermediate data
    data = []

    LOGGER.debug('Opening genome file: %s', fai)
    # Keep just first two column or _complement will fail...
    fai = _first_two_columns(fai)
    with open(fai) as gfile:
        chromosomes = [line.strip().split()[0] for line in gfile]

    def process_gene(gene_content):
        """
        Process each group of intervals belonging to gene.

        Process each transcript_group in gene_content, add 'biotype'
        attribute to all intervals and include them in `data`.
        """
        assert 'gene' in gene_content

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            gene_content[id_] = _process_transcript_group(transcript_group)

        # Add biotype attribute to all intervals:
        gene_content = _add_biotype_attribute(gene_content)

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            data.extend(transcript_group)
        data.append(gene_content['gene'])

    LOGGER.debug('Processing genome annotation from: %s', annotation)
    # NOTE: genes are processed sequentially. Using multiprocessing.Pool here
    # caused huge memory usage. Possible explanation and solution:
    # http://stackoverflow.com/questions/21485319/high-memory-usage-using-python-multiprocessing
    # TODO: check and fix execution in parallel, look example
    # https://daler.github.io/pybedtools/3-brief-examples.html#example-3-count-reads-in-introns-and-exons-in-parallel
    for gene_content in _get_gene_content(annotation, chromosomes, report_progress):
        process_gene(gene_content)
        LOGGER.debug('Just processed gene: %s', gene_content['gene'].attrs['gene_id'])
        metrics.genes += 1

    # Produce GTF/GFF file from data:
    gtf = pybedtools.BedTool(i.fields for i in data).saveas()

    LOGGER.info('Calculating intergenic regions...')
    intergenic_pos = _complement(gtf.fn, fai, '+')
    intergenic_neg = _complement(gtf.fn, fai, '-')

    # Join the gtf, intergenic_pos and intergenic_neg in one file. Context
    # managers make sure every handle is closed, also when copying fails.
    with tempfile.NamedTemporaryFile(delete=False) as file2:
        for infile in [gtf.fn, intergenic_pos, intergenic_neg]:
            with open(infile, 'rb') as handle:
                shutil.copyfileobj(handle, file2)

    file3 = pybedtools.BedTool(file2.name).sort().saveas(segmentation)
    LOGGER.info('Segmentation stored in %s', file3.fn)

    return metrics