def main(dat, gzipped, flags, flag_filter, min_quality, bin_size, file=stdout, **kwargs):
    """Dispatch data to subroutines"""
    samfilters = [flags, flag_filter, min_quality]
    kmerscans = [
        load_kmerscan(fn, gzipped, samfilters, bin_size) for fn in dat
    ]
    entropies = concat(
        calculate_entropies(bdf) for bdf in progressbar(
            chain(*(ks.values() for ks in kmerscans)),
            desc="Calculating entropies", unit="arm",
            total=sum(len(ks) for ks in kmerscans),
        )
    )
    quantiles = {
        q: weighted_quantile(
            entropies["#entropy"], entropies["coverage"] - 1, q / 100,
        )
        for q in progressbar(range(5, 101, 5), desc="Calculating quantiles")
    }
    print("#" + ",".join(f"q{k}={v}" for k, v in quantiles.items()), file=file)
    entropies.to_csv(file, sep="\t", index=False)

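# `weighted_quantile` is provided elsewhere in this package; a minimal sketch
# of the idea, assuming each entropy value is weighted by coverage-1
# (hypothetical helper shown for illustration, not this package's implementation):
#
#     import numpy as np
#     def weighted_quantile_sketch(values, weights, q):
#         order = np.argsort(values)
#         v, w = np.asarray(values)[order], np.asarray(weights)[order]
#         cumulative = np.cumsum(w) / np.sum(w)
#         return np.interp(q, cumulative, v)
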
def filter_and_read_tsv(dat, gzipped, integer_samfilters):
    """If filters supplied, subset DAT first, then read with pandas"""
    number_retained = 0
    if gzipped:
        opener = gzopen
    else:
        opener = open
    with opener(dat, mode="rt") as dat_handle:
        with TemporaryDirectory() as tempdir:
            datflt_name = path.join(tempdir, "dat.gz")
            with gzopen(datflt_name, mode="wt") as datflt:
                decorated_line_iterator = progressbar(
                    dat_handle, desc="Filtering", unit=" lines",
                )
                for line in decorated_line_iterator:
                    if line[0] == "#":
                        print(line, end="", file=datflt)
                    else:
                        fields = line.split("\t")
                        line_passes_filter = entry_filters_ok(
                            int(fields[1]), int(fields[4]), integer_samfilters,
                        )
                        if line_passes_filter:
                            number_retained += 1
                            print(line, end="", file=datflt)
            print("Kept {} records".format(number_retained), file=stderr)
            print("Loading DAT...", file=stderr, flush=True)
            return read_csv(datflt_name, sep="\t", escapechar="#")

def parse_bam_with_ambiguity(bam, ecxfd, max_read_length, min_map_overlap, targets, samfilters):
    """Parse BAM file, select overhanging reads, possibly mapping to more than one arm"""
    with AlignmentFile(bam) as bam_data:
        reflens = dict(zip(bam_data.references, bam_data.lengths))
        bam_header_string = str(bam_data.header).rstrip("\n")
        decorated_bam_iterator = progressbar(
            ecxfd, total=len(ecxfd), desc="Pulling", unit="chromosome",
        )
        entries = []
        for chrom in decorated_bam_iterator:
            bam_chunk = get_bam_chunk(
                bam_data, chrom, ecxfd, reflens, max_read_length,
            )
            entries.extend(
                filter_entries(
                    bam_chunk, ecxfd, targets, samfilters, min_map_overlap,
                ),
            )
    return bam_header_string, entries

def filter_bam(alignment, samfilters, desc=None):
    """Wrap alignment iterator with a flag and quality filter"""
    integer_samfilters = list(map(interpret_flags, samfilters))
    filtered_iterator = (
        entry for entry in alignment
        if entry_filters_ok(entry.flag, entry.mapq, integer_samfilters)
    )
    if desc is None:
        return filtered_iterator
    else:
        return progressbar(filtered_iterator, desc=desc, unit="read")

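# Usage sketch (hypothetical BAM path and filter values; the samfilters list
# is [required flags, excluded flags, minimum MAPQ]):
#
#     from pysam import AlignmentFile
#     with AlignmentFile("reads.bam") as alignment:
#         for entry in filter_bam(alignment, [0, 3844, 20], desc="Filtering"):
#             ...  # e.g. inspect entry.query_name, entry.query_sequence
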
def load_kmerscan(dat, gzipped, samfilters, bin_size=None, no_align=False, each_once=True):
    """Load densities from dat file, split into dataframes per chromosome"""
    integer_samfilters = list(map(interpret_flags, samfilters))
    if not any(integer_samfilters):  # all zero / None
        print("Loading DAT...", file=stderr, flush=True)
        raw_densities = read_csv(dat, sep="\t", escapechar="#")
    else:
        raw_densities = filter_and_read_tsv(dat, gzipped, integer_samfilters)
    if len(raw_densities) == 0:
        raise EmptyKmerscanError
    if not are_motifs_consistent(raw_densities):
        raise NotImplementedError(KMERSCANNER_INCONSISTENT_NUMBER_OF_MOTIFS)
    bin_size_data = raw_densities.columns[-1]
    raw_densities.rename(columns={bin_size_data: "density"}, inplace=True)
    if bin_size is None:
        bin_size_matcher = search(r'[0-9]+$', bin_size_data)
        if bin_size_matcher:
            bin_size = int(bin_size_matcher.group())
        else:
            raise ValueError("No bin size in DAT, user must specify")
    if each_once:
        count_commas = lambda d: d.count(",") + 1
        raw_densities["length"] = raw_densities["density"].apply(count_commas)
        groups = raw_densities[["name", "motif", "length"]].groupby(
            ["name", "motif"], as_index=False,
        ).max()
        raw_densities = merge(groups, raw_densities).drop(columns="length")
    if no_align:
        raw_densities["chrom"] = "None"
    chromosome_iterator = progressbar(
        raw_densities["chrom"].drop_duplicates(),
        desc="Interpreting data", unit="chromosome",
    )
    return {
        chrom: get_binned_density_dataframe(
            raw_densities, chrom, bin_size, no_align,
        )
        for chrom in chromosome_iterator
    }

def analyze_repeats(full_report, collapse_reverse_complement=False, adj="bonferroni"):
    """Analyze repeat enrichment for multiple lengths and apply multiple testing adjustment"""
    candidates = concat([
        get_motifs_fisher(
            full_report[full_report["length"] == length],
            collapse_reverse_complement=collapse_reverse_complement,
        )
        for length in progressbar(
            unique(full_report["length"].values), unit="k",
            desc="Calculating enrichment",
        )
    ])
    candidates["p_adjusted"] = multipletests(candidates["p"], method=adj)[1]
    return candidates[["motif", "length", "count", "p", "p_adjusted"]]

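# Illustrative example of the adjustment step, assuming `multipletests` here
# is statsmodels' statsmodels.stats.multitest.multipletests: with
# method="bonferroni" each p-value is multiplied by the number of tests
# (capped at 1), and element [1] of the returned tuple holds the corrected
# p-values:
#
#     from statsmodels.stats.multitest import multipletests
#     multipletests([0.01, 0.04], method="bonferroni")[1]  # array([0.02, 0.08])
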
def make_decorated_densities_iterator(densities, chroms_to_plot=None):
    """Order chromosomes and wrap with progress bar"""
    if chroms_to_plot:
        sorted_chromosomes = natsorted_chromosomes(
            set(densities.keys()) | set(chroms_to_plot.split(","))
        )
    else:
        sorted_chromosomes = natsorted_chromosomes(densities.keys())
    sorted_densities_iterator = (
        (chrom, densities.get(chrom)) for chrom in sorted_chromosomes
    )
    decorated_densities_iterator = progressbar(
        sorted_densities_iterator, total=len(sorted_chromosomes),
        desc="Plotting", unit="chromosome",
    )
    return decorated_densities_iterator, len(sorted_chromosomes)

def calculate_chromosome_lds(chrom, entries, jobs):
    """Calculate pairwise relative Levenshtein distances between all reads mapping to one chromosome"""
    with ThreadPoolExecutor(max_workers=jobs) as pool:
        workers = [
            pool.submit(
                get_relative_read_ld,
                aname, bname, entries[aname], entries[bname],
            )
            for aname, bname in combinations_with_replacement(
                sorted(entries.keys()), r=2,
            )
        ]
        iterator = progressbar(
            as_completed(workers), desc=chrom, unit="pair", total=len(workers),
        )
        for worker in iterator:
            yield worker.result()

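# Note: combinations_with_replacement() pairs each read with itself as well,
# so n reads produce n*(n+1)/2 submitted jobs. Illustrative example:
#
#     from itertools import combinations_with_replacement
#     list(combinations_with_replacement(["a", "b", "c"], r=2))
#     # [('a', 'a'), ('a', 'b'), ('a', 'c'), ('b', 'b'), ('b', 'c'), ('c', 'c')]
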
def explain_report(filtered_analysis, sequencefile, min_repeats, jobs=1):
    """Calculate fraction of reads explainable by each motif"""
    explained_analysis = filtered_analysis.copy()
    explained_analysis["bases_explained"], total_bases = 0.0, 0
    with FastxFile(sequencefile) as fastx:
        def get_number_of_masked_positions(sequence, motifs):
            n_masked_positions_per_motif = {}
            for motif in motifs:
                positions_to_mask = set()
                motifs_pattern = get_circular_pattern(
                    motif, repeats=min_repeats,
                )
                matcher = motifs_pattern.finditer(sequence, overlapped=True)
                for match in matcher:
                    positions_to_mask |= set(range(match.start(), match.end()))
                n_masked_positions_per_motif[motif] = len(positions_to_mask)
            return n_masked_positions_per_motif, len(sequence)
        with ThreadPoolExecutor(max_workers=jobs) as pool:
            workers = [
                pool.submit(
                    get_number_of_masked_positions, entry.sequence,
                    set(filtered_analysis["motif"]),
                )
                for entry in fastx
            ]
            iterator = progressbar(
                as_completed(workers), total=len(workers),
                desc="Calculating fractions", unit="read",
            )
            for worker in iterator:
                n_masked_positions_per_motif, total_seq_bases = worker.result()
                for motif, n_pos in n_masked_positions_per_motif.items():
                    indexer = (
                        explained_analysis["motif"] == motif, "bases_explained",
                    )
                    explained_analysis.loc[indexer] += n_pos
                total_bases += total_seq_bases
    return explained_analysis, total_bases

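# Note: `overlapped=True` in finditer() is a feature of the third-party
# `regex` module (the standard-library `re` has no such argument), so tandem
# hits that share bases all contribute to the masked positions. Illustrative
# example:
#
#     import regex
#     [m.span() for m in regex.finditer(
#         "TTAGGGTTAGGG", "TTAGGGTTAGGGTTAGGG", overlapped=True,
#     )]
#     # [(0, 12), (6, 18)]
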
def main(bam, index, flags, flag_filter, min_quality, target, file=stdout, **kwargs):
    """Interpret arguments and dispatch data to subroutines"""
    if target == "cigar":
        chopper, integer_target = cigar_chopper, None
    else:
        chopper, integer_target = relative_chopper, interpret_flags(target)
    ecx = load_index(index)
    with AlignmentFile(bam) as alignment:
        print(str(alignment.header).rstrip("\n"), file=file)
        n_skipped = 0
        bam_iterator = progressbar(
            filter_bam(alignment, [flags, flag_filter, min_quality]),
            desc="Chopping", unit="read",
        )
        with errstate(invalid="ignore"):
            for entry in bam_iterator:
                if entry.query_sequence:
                    chopped_entry, error = chopper(
                        entry, ecx, integer_target,
                    )
                    if chopped_entry.query_sequence:
                        print(chopped_entry.to_string(), file=file)
                    else:
                        n_skipped += 1
    if n_skipped:
        msg_mask = "Skipped {} reads to be safe (unsure where to chop)"
        print(msg_mask.format(n_skipped), file=stderr)
    warning = [
        "WARNING: Read mapping positions were adjusted and retained;",
        " this is needed to comply with the SAM spec.",
        " Do not use these positions for analyses outside of edgeCase!",
    ]
    print("\n".join(warning), file=stderr)
    return 0

def find_repeats(sequencefile, min_k, max_k, min_repeats, jellyfish, jellyfish_hash_size, collapse_reverse_complement, jobs, tempdir):
    """Find all repeats in sequencefile"""
    per_k_reports = []
    k_iterator = progressbar(
        range(min_k, max_k + 1), desc="Sweeping lengths", unit="k",
    )
    for k in k_iterator:
        db = path.join(tempdir, "{}.db".format(k))
        jellyfish_count_options = [
            jellyfish, "count", "-t", str(jobs), "-s", jellyfish_hash_size,
            "-L", "0", "-m", str(k * min_repeats),
        ]
        if collapse_reverse_complement:
            jellyfish_count_options += ["-C"]
        check_output(jellyfish_count_options + ["-o", db, sequencefile])
        tsv = path.join(tempdir, "{}.tsv".format(k))
        check_output([
            jellyfish, "dump", "-c", "-t", "-L", "0", "-o", tsv, db,
        ])
        k_report = read_csv(tsv, sep="\t", names=["kmer", "count"])
        if len(k_report) == 0:
            return None
        repeats_indexer = k_report["kmer"].apply(
            lambda kmer: kmer[:k] * min_repeats == kmer
        )
        k_report = k_report[repeats_indexer]
        k_report["kmer"] = k_report["kmer"].apply(lambda kmer: kmer[:k])
        k_report["length"] = k
        per_k_reports.append(k_report)
    return concat(per_k_reports, axis=0)

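# Usage sketch (hypothetical argument values; assumes a `jellyfish` executable
# on PATH and a writable temporary directory):
#
#     from tempfile import TemporaryDirectory
#     with TemporaryDirectory() as tempdir:
#         report = find_repeats(
#             "reads.fastq", min_k=4, max_k=16, min_repeats=2,
#             jellyfish="jellyfish", jellyfish_hash_size="2G",
#             collapse_reverse_complement=True, jobs=4, tempdir=tempdir,
#         )
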
def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, bin_size, num_reads, jobs):
    """Calculate density of pattern hits in a rolling window along each read"""
    if fmt == "sam":
        filtered_iterator = filter_bam(entry_iterator, samfilters)
    else:
        filtered_iterator = entry_iterator
    simple_entry_iterator = (
        SimpleNamespace(
            query_name=getattr(
                entry, "query_name", getattr(entry, "name", None),
            ),
            flag=getattr(entry, "flag", None),
            reference_name=getattr(entry, "reference_name", None),
            reference_start=getattr(entry, "reference_start", None),
            mapping_quality=getattr(entry, "mapping_quality", None),
            query_sequence=getattr(
                entry, "query_sequence", getattr(entry, "sequence", None),
            ),
            cigarstring=getattr(entry, "cigarstring", ""),
        )
        for entry in filtered_iterator
    )
    with Pool(jobs) as pool:
        # imap_unordered() only accepts single-argument functions:
        density_calculator = partial(
            calculate_density_of_patterns,
            motif_patterns=motif_patterns, bin_size=bin_size,
        )
        # lazy multiprocess evaluation:
        read_density_iterator = pool.imap_unordered(
            density_calculator, simple_entry_iterator,
        )
        # iterate pairs (entry.query_name, density_array), as produced by
        # calculate_density_of_patterns():
        desc = "Calculating density"
        yield from progressbar(
            read_density_iterator, desc=desc, unit="read", total=num_reads,
        )

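# Design note: entries are copied into plain SimpleNamespace objects before
# being dispatched to the worker pool, presumably so that only simple,
# picklable attributes cross the process boundary and so that SAM/BAM records
# and raw FASTA/FASTQ records expose the same field names (e.g. `sequence`
# vs `query_sequence`, `name` vs `query_name`).
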
def get_unambiguous_entries(entries, ecx):
    """Subset candidate entries to those that map unambiguously"""
    entry_dispatcher, valid_qnames = make_entry_dispatchers(entries, ecx)
    chromosomes = set(ecx["chromosome"].drop_duplicates())
    for qname in progressbar(valid_qnames, desc="Filtering", unit="read"):
        entry_mappings = entry_dispatcher[entry_dispatcher["qname"] == qname]
        entry_mapped_to_main = entry_mappings["rname"].isin(chromosomes)
        if entry_mapped_to_main.any():  # prefer canonical chromosomes
            target_entry_mappings = entry_mappings[entry_mapped_to_main]
        else:  # fall back to forks / subtelomeres
            target_entry_mappings = entry_mappings
        rnames = target_entry_mappings["rname"].drop_duplicates()
        primes = target_entry_mappings["prime"].drop_duplicates()
        if (len(rnames) == 1) and (len(primes) == 1):
            if primes.iloc[0] == 3:  # find innermost mappos on same q arm
                target_mappos = target_entry_mappings["mappos"].min()
            else:  # find innermost mappos on same p arm
                target_mappos = target_entry_mappings["mappos"].max()
            entry_candidates = target_entry_mappings.loc[
                target_entry_mappings["mappos"] == target_mappos, "entry",
            ]
            if len(entry_candidates) == 1:
                entry = entry_candidates.iloc[0]
                if (entry.flag & 0x800) == 0:  # non-supplementary
                    yield entry
