def __init__(self, params):
    """Initialize the placer state.

    Parses the NCRF report, creates an empty cloud contig, optionally
    loads a genomic k-mer whitelist (one k-mer per line), creates the
    output directory, and fixes the read-position output path.
    """
    self.params = params
    self.ncrf_report = NCRF_Report(params.ncrf)
    self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
    if params.genomic_kmers is not None:
        # One k-mer per line; strip newlines and deduplicate into a set.
        with open(params.genomic_kmers) as kmer_file:
            self.genomic_kmers = {line.strip() for line in kmer_file}
    else:
        self.genomic_kmers = None
    smart_makedirs(params.outdir)
    self.position_outfile = os.path.join(self.params.outdir,
                                         'read_positions.csv')
def main():
    """Pipeline entry point.

    Selects rare k-mers from the reads' NCRF report, builds per-read
    k-mer clouds restricted to those k-mers, accumulates pairwise k-mer
    distance counts, filters them by coverage, and writes the results to
    the output directory.
    """
    params = parse_args()
    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    rare_kmers = get_rare_kmers(ncrf_report,
                                k=params.k,
                                bottom=params.bottom,
                                top=params.top,
                                coverage=params.coverage,
                                kmer_survival_rate=params.kmer_survival_rate,
                                max_nonuniq=params.max_nonuniq,
                                verbose=params.verbose)
    kmer_clouds = get_reads_kmer_clouds(ncrf_report,
                                        n=1,
                                        k=params.k,
                                        genomic_kmers=rare_kmers)
    dist_cnt, kmer_index = get_kmer_dist_map(kmer_clouds,
                                             rare_kmers,
                                             min_n=params.min_nreads,
                                             max_n=params.max_nreads,
                                             min_d=params.min_distance,
                                             max_d=params.max_distance,
                                             verbose=params.verbose)
    unique_kmers_ind, dist_edges = \
        filter_dist_tuples(dist_cnt, min_coverage=params.min_coverage)
    output_results(kmer_index=kmer_index,
                   min_coverage=params.min_coverage,
                   unique_kmers_ind=unique_kmers_ind,
                   dist_edges=dist_edges,
                   outdir=params.outdir)
def __init__(self, params):
    """Load inputs and derive the polishing window.

    Reads the unit sequence (raising FileNotFoundError if missing), the
    NCRF report and its motif alignments, prepares the output directory,
    loads reported read placements, and — when no explicit max position
    was given (math.inf) — derives max_pos as the rightmost unit covered
    by any placed read.
    """
    self.params = params
    if not os.path.isfile(params.unit):
        raise FileNotFoundError(f"File {params.unit} is not found")
    self.unit = read_bio_seq(params.unit)
    self.ncrf_report = NCRF_Report(params.ncrf)
    self.motif_alignments = self.ncrf_report.get_motif_alignments()
    smart_makedirs(params.outdir)
    self.read_placement = read_reported_positions(params.read_placement)
    self.max_pos = self.params.max_pos
    self.min_pos = self.params.min_pos
    if self.max_pos == math.inf:
        # No explicit bound: scan placed reads for the furthest unit end.
        self.max_pos = 0
        for r_id, pos in self.read_placement.items():
            if pos is None:
                continue
            n_units = len(self.motif_alignments[r_id])
            self.max_pos = max(self.max_pos, pos + n_units)
def main():
    """Split each NCRF alignment into individual motif units.

    For every aligned sequence in the NCRF report, cuts the alignment
    into its motif units, pads each unit with `--buf` flanking bases
    taken from the input sequence, and writes one FASTA file per
    sequence into `--outdir`.

    Fix: removed the dead locals `alignment` and `coords`, which were
    computed/populated but never used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf", help="Buffer on the sides to include",
                        type=int, default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        al_start = record.r_st
        start = 0
        for ma in mas:
            # Gap-free unit sequence within the alignment.
            seq_al = record.r_al[ma.start:ma.end]
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            # Flanking context from the raw input sequence on both sides.
            seq_st = input_seq[al_start + start - params.buf:
                               al_start + start]
            seq_en = input_seq[al_start + end:end + al_start + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            # Sanity check: padded unit matches the input sequence span.
            assert input_seq[start + al_start - len(seq_st):
                             end + al_start + len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
def main():
    """Polish the unit sequence.

    Counts the most frequent k-mers in the reads' NCRF report relative
    to the unit sequence, builds a polished unit from them, and writes
    it to the output FASTA under the id 'DXZ1*'.
    """
    params = parse_args()
    smart_makedirs(os.path.dirname(params.output))
    reads_ncrf_report = NCRF_Report(params.reads_ncrf)
    unit_seq = read_bio_seq(params.unit)
    kmer_counts_reads, most_frequent_kmers = get_most_frequent_kmers(
        reads_ncrf_report, k=params.k, unit_seq=unit_seq)
    new_unit = get_polished_unit(k=params.k,
                                 most_frequent_kmers=most_frequent_kmers,
                                 kmer_counts_reads=kmer_counts_reads,
                                 unit_seq=unit_seq)
    write_bio_seqs(params.output, {'DXZ1*': new_unit})
class ELTR_Polisher:
    """Polishes an extended long tandem repeat position-by-position.

    Maps genomic positions to read units, exports per-position unit
    FASTAs, runs Flye polishing on each position, stitches the polished
    units into full sequences (one per polishing iteration), compares
    consecutive iterations, and exports the results.

    Fixes vs. original: removed the dead local `median_r_ids` in
    export_read_units; the unused unpacked filename in read_polishing is
    now bound to `_`.
    """

    def __init__(self, params):
        """Load inputs and derive the polishing window [min_pos, max_pos]."""
        self.params = params
        if not os.path.isfile(params.unit):
            raise FileNotFoundError(f"File {params.unit} is not found")
        self.unit = read_bio_seq(params.unit)
        self.ncrf_report = NCRF_Report(params.ncrf)
        self.motif_alignments = self.ncrf_report.get_motif_alignments()
        smart_makedirs(params.outdir)
        self.read_placement = read_reported_positions(params.read_placement)
        self.max_pos = self.params.max_pos
        self.min_pos = self.params.min_pos
        if self.max_pos == math.inf:
            # No explicit bound: take the rightmost unit covered by any
            # placed read.
            self.max_pos = 0
            for r_id, pos in self.read_placement.items():
                if pos is None:
                    continue
                ma = self.motif_alignments[r_id]
                self.max_pos = max(self.max_pos, pos + len(ma))

    def map_pos2read(self):
        """Map each genomic position to the (read, unit-index) pairs covering it.

        Border units (first/last of a read) are only used for reads that
        touch the window boundary, since border units are typically less
        reliable.
        """
        pos2read = defaultdict(list)
        for r_id, pos in self.read_placement.items():
            if pos is None or pos > self.max_pos:
                continue
            ma = self.motif_alignments[r_id]
            if pos == self.min_pos or pos + len(ma) == self.max_pos:
                positions = range(len(ma))
            else:
                # Interior read: drop its first and last units.
                positions = range(1, len(ma) - 1)
            for i in positions:
                if self.min_pos <= pos + i <= self.max_pos:
                    pos2read[pos + i].append((r_id, i))
        return pos2read

    def export_read_units(self, pos2read):
        """Write per-position FASTAs of read units and a median-length unit.

        For each position, exports all covering read units plus the unit
        of median (median_high) length, which later serves as the
        polishing template.

        Returns:
            dict: position -> (units_fn, median_read_unit_fn).
        """
        filenames = {}
        for pos in pos2read:
            outdir = os.path.join(self.params.outdir, f'pos_{pos}')
            units_fn = os.path.join(outdir, 'read_units.fasta')
            median_read_unit_fn = \
                os.path.join(outdir, 'median_read_unit.fasta')
            smart_makedirs(outdir)
            seqs = {}
            median_read_unit, template_read = "", None
            for (r_id, p) in pos2read[pos]:
                r_al = self.motif_alignments[r_id][p].r_al
                r_al = r_al.upper().replace('-', '')
                seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = r_al
            r_units_lens = [len(seq) for seq in seqs.values()]
            # median_high guarantees the median is an actually observed
            # length, so a matching unit always exists below.
            med_len = statistics.median_high(r_units_lens)
            for r_id in sorted(seqs.keys()):
                r_al = seqs[r_id]
                if len(r_al) == med_len:
                    median_read_unit = r_al
                    template_read = r_id
                    break
            assert len(seqs[template_read]) == med_len
            assert len(median_read_unit) == med_len
            write_bio_seqs(units_fn, seqs)
            write_bio_seqs(median_read_unit_fn,
                           {template_read: median_read_unit})
            filenames[pos] = (units_fn, median_read_unit_fn)
        return filenames

    def run_polishing(self, read_unit_filenames):
        """Run Flye '--polish-target' for every position's unit set."""
        min_pos = min(read_unit_filenames.keys())
        max_pos = max(read_unit_filenames.keys())
        for pos in range(min_pos, max_pos + 1):
            print(pos, max_pos)
            units_fn, median_read_unit_fn = read_unit_filenames[pos]
            pos_dir = os.path.dirname(units_fn)
            cmd = [self.params.flye_bin,
                   f'--{self.params.error_mode}-raw',
                   units_fn,
                   '--polish-target', median_read_unit_fn,
                   '-i', self.params.num_iters,
                   '-t', self.params.num_threads,
                   '-o', pos_dir]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

    def read_polishing(self, read_unit_filenames):
        """Collect Flye outputs and stitch them into full sequences.

        Returns:
            dict: iteration index (1-based) -> concatenated polished
            sequence over positions [min_pos, max_pos].
        """
        min_pos = min(read_unit_filenames.keys())
        max_pos = max(read_unit_filenames.keys())
        polished_seqs = {}
        final_sequences = {}
        for i in range(1, self.params.num_iters + 1):
            for pos, (units_fn, _) in read_unit_filenames.items():
                pos_dir = os.path.dirname(units_fn)
                polished_seq_fn = os.path.join(pos_dir, f'polished_{i}.fasta')
                polished_seq = read_bio_seq(polished_seq_fn)
                polished_seqs[pos] = polished_seq
            final_sequence = \
                [polished_seqs[pos] for pos in range(min_pos, max_pos + 1)]
            final_sequence = ''.join(final_sequence)
            final_sequences[i] = final_sequence
        return final_sequences

    def compare_polished_sequences(self, final_sequences):
        """Write edlib alignments between consecutive polishing iterations.

        Reports both raw and homopolymer-compressed alignments to
        'report.txt' in the output directory.
        """
        report_fn = os.path.join(self.params.outdir, 'report.txt')
        with open(report_fn, 'w') as f:
            for i in range(1, self.params.num_iters):
                seq_i, seq_i1 = final_sequences[i], final_sequences[i+1]
                alignment = edlib.align(seq_i, seq_i1)
                print(f'Alignment polishing seq {i} vs {i+1}:', file=f)
                print(alignment, file=f)
                hpc_seq_i = compress_homopolymer(final_sequences[i])
                hpc_seq_i1 = compress_homopolymer(final_sequences[i+1])
                alignment = edlib.align(hpc_seq_i, hpc_seq_i1)
                print(f'Alignment homopolymer compressed polishing seq {i} vs {i+1}:', file=f)
                print(alignment, file=f)

    def export_results(self, final_sequences):
        """Write each iteration's sequence (raw and homopolymer-compressed)."""
        for i in range(1, self.params.num_iters + 1):
            final_sequence = final_sequences[i]
            final_sequence_hpc = compress_homopolymer(final_sequence)
            final_fn = os.path.join(self.params.outdir,
                                    f'final_sequence_{i}.fasta')
            write_bio_seqs(final_fn, {f'polished_repeat_{i}': final_sequence})
            final_hpc_fn = os.path.join(self.params.outdir,
                                        f'final_sequence_hpc_{i}.fasta')
            write_bio_seqs(final_hpc_fn,
                           {f'polished_repeat_{i}': final_sequence_hpc})

    def run(self):
        """Execute the full polishing pipeline end to end."""
        pos2read = self.map_pos2read()
        read_unit_filenames = self.export_read_units(pos2read)
        self.run_polishing(read_unit_filenames)
        final_sequences = self.read_polishing(read_unit_filenames)
        self.compare_polished_sequences(final_sequences)
        self.export_results(final_sequences)
class ReadPlacer:
    """Places reads on a growing cloud contig by shared rare k-mers.

    Prefix reads seed the contig at position 0; remaining reads are
    greedily added at their best-scoring positions. Placements are
    appended to 'read_positions.csv' in the output directory.
    """

    def __init__(self, params):
        """Parse the NCRF report, build an empty cloud contig, optionally
        load a genomic k-mer whitelist, and prepare output paths."""
        self.params = params
        self.ncrf_report = NCRF_Report(params.ncrf)
        self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
        if params.genomic_kmers is not None:
            # One k-mer per line; strip newlines and deduplicate.
            with open(params.genomic_kmers) as kmer_file:
                self.genomic_kmers = {line.strip() for line in kmer_file}
        else:
            self.genomic_kmers = None
        smart_makedirs(params.outdir)
        self.position_outfile = os.path.join(self.params.outdir,
                                             'read_positions.csv')

    def reset_cloud_contig(self):
        """Discard the current contig and start a fresh, empty one."""
        self.cloud_contig = CloudContig(self.params.min_cloud_kmer_freq)

    def add_prefix_reads(self, prefix_reads, reads_kmer_clouds):
        """Seed the contig with prefix reads, all placed at position 0.

        Truncates and rewrites the position output file.
        """
        with open(self.position_outfile, 'w') as out:
            for r_id in prefix_reads:
                self.cloud_contig.add_read(reads_kmer_clouds[r_id],
                                           position=0)
                print(r_id, 0, file=out)

    def add_reads(self, reads, reads_kmer_clouds, min_unit, min_inters,
                  min_prop=3):
        """Greedily place reads on the contig by k-mer cloud scores.

        Repeatedly picks the (read, position) pair with the best score
        that passes the min_unit / min_prop / min_inters thresholds,
        adds it to the contig, and appends the placement to the output
        file. Ties prefer larger positions, then smaller read ids.
        Reads that never pass the thresholds are written with position
        None.
        """
        # Invert the clouds: k-mer -> list of (read id, unit index).
        kmers2pos = defaultdict(list)
        for r_id in reads:
            kmer_clouds = reads_kmer_clouds[r_id]
            for unit_idx, cloud in enumerate(kmer_clouds.kmers):
                for kmer in cloud:
                    kmers2pos[kmer].append((r_id, unit_idx))
        unused_reads = set(reads)
        n_reads = len(unused_reads)
        scores = None
        # Current frequent (k-mer, position) pairs on the contig.
        freq_kmers = [(kmer, kmer_pos)
                      for kmer in self.cloud_contig.freq_kmers
                      for kmer_pos in self.cloud_contig.kmer_positions[kmer]]
        with open(self.position_outfile, 'a') as out:
            while len(unused_reads):
                scores = update_mapping_scores(self.cloud_contig,
                                               kmers2pos,
                                               freq_kmers=freq_kmers,
                                               scores=scores)
                best_score, best_position, best_read = (-1, -1), None, None
                for r_id in unused_reads:
                    for pos in scores[r_id]:
                        score = scores[r_id][pos]
                        # Rank by (#matching units, total intersections).
                        score = (len(score), sum(score.values()))
                        passes = (score > best_score and
                                  score[0] >= min_unit and
                                  score[0] * min_prop <= score[1] and
                                  score[1] >= min_inters)
                        tie_break = (score == best_score and
                                     (pos > best_position or
                                      (pos == best_position and
                                       r_id < best_read)))
                        if passes or tie_break:
                            best_score = score
                            best_position = pos
                            best_read = r_id
                if best_read is None:
                    # Nothing placeable remains; log the leftovers.
                    print(f"Unused reads {len(unused_reads)}, {n_reads}, "
                          f"{len(unused_reads) / n_reads}")
                    for read in unused_reads:
                        print(read, None, file=out)
                    return
                print(best_score, best_position, best_read)
                print("")
                print(best_read, best_position,
                      best_score[0], best_score[1], file=out)
                freq_kmers = self.cloud_contig.add_read(
                    reads_kmer_clouds[best_read], position=best_position)
                unused_reads.remove(best_read)

    def run(self):
        """Classify reads, build filtered k-mer clouds, and place all reads."""
        left_PT_reads, FT_reads, right_PT_reads = \
            self.ncrf_report.classify(
                large_threshold=self.params.prefix_threshold)
        print(f'Left: {len(left_PT_reads)}')
        print(f'FT: {len(FT_reads)}')
        print(f'Right: {len(right_PT_reads)}')
        print("Reading kmer clouds from reads")
        reads_kmer_clouds = \
            get_reads_kmer_clouds(self.ncrf_report,
                                  n=self.params.n_motif,
                                  k=self.params.k_cloud,
                                  genomic_kmers=self.genomic_kmers)
        print("Filtering kmer clouds from reads")
        reads_kmer_clouds = \
            filter_reads_kmer_clouds(reads_kmer_clouds,
                                     min_mult=self.params.min_kmer_mult)
        print("Adding prefix reads")
        self.add_prefix_reads(left_PT_reads, reads_kmer_clouds)
        print(self.cloud_contig.max_pos)
        print("Adding inner reads")
        self.add_reads(FT_reads, reads_kmer_clouds,
                       min_unit=self.params.min_unit,
                       min_inters=self.params.min_inters)
        print(self.cloud_contig.max_pos)
        print("\nNow adding suffix reads")
        self.add_reads(right_PT_reads, reads_kmer_clouds,
                       min_unit=self.params.min_unit,
                       min_inters=self.params.min_inters)