Example #1
0
 def __init__(self, params):
     """Initialize the read placer from parsed CLI arguments.

     Args:
         params: parsed arguments; must provide ncrf (NCRF report path),
             min_cloud_kmer_freq, genomic_kmers (path or None), outdir.
     """
     self.params = params
     self.ncrf_report = NCRF_Report(params.ncrf)
     self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
     if params.genomic_kmers is not None:
         # One k-mer per line; duplicates collapse when converted to a set.
         kmers = []
         with open(params.genomic_kmers) as f:
             for line in f:
                 kmers.append(line.strip())
         self.genomic_kmers = set(kmers)
     else:
         # No k-mer file supplied; downstream code must handle None.
         self.genomic_kmers = None
     smart_makedirs(params.outdir)
     # Output path for the per-read placement report.
     self.position_outfile = \
         os.path.join(self.params.outdir, 'read_positions.csv')
def main():
    """Pipeline entry point: select rare k-mers from the reads' NCRF
    report, build a k-mer distance map, filter it by coverage, and write
    the resulting unique-k-mer distance edges to the output directory."""
    args = parse_args()
    smart_makedirs(args.outdir)

    ncrf_report = NCRF_Report(args.ncrf)
    rare = get_rare_kmers(
        ncrf_report,
        k=args.k,
        bottom=args.bottom,
        top=args.top,
        coverage=args.coverage,
        kmer_survival_rate=args.kmer_survival_rate,
        max_nonuniq=args.max_nonuniq,
        verbose=args.verbose,
    )

    clouds = get_reads_kmer_clouds(ncrf_report,
                                   n=1,
                                   k=args.k,
                                   genomic_kmers=rare)

    dist_cnt, kmer_index = get_kmer_dist_map(
        clouds,
        rare,
        min_n=args.min_nreads,
        max_n=args.max_nreads,
        min_d=args.min_distance,
        max_d=args.max_distance,
        verbose=args.verbose,
    )

    uniq_ind, edges = filter_dist_tuples(dist_cnt,
                                         min_coverage=args.min_coverage)

    output_results(kmer_index=kmer_index,
                   min_coverage=args.min_coverage,
                   unique_kmers_ind=uniq_ind,
                   dist_edges=edges,
                   outdir=args.outdir)
Example #3
0
 def __init__(self, params):
     """Load the unit sequence, the NCRF report, and the read placements,
     and derive the polishing position range.

     Args:
         params: parsed arguments; must provide unit (FASTA path), ncrf,
             outdir, read_placement, min_pos, max_pos.

     Raises:
         FileNotFoundError: if the unit FASTA file does not exist.
     """
     self.params = params
     if not os.path.isfile(params.unit):
         raise FileNotFoundError(f"File {params.unit} is not found")
     self.unit = read_bio_seq(params.unit)
     self.ncrf_report = NCRF_Report(params.ncrf)
     self.motif_alignments = self.ncrf_report.get_motif_alignments()
     smart_makedirs(params.outdir)
     self.read_placement = read_reported_positions(params.read_placement)
     self.max_pos = self.params.max_pos
     self.min_pos = self.params.min_pos
     if self.max_pos == math.inf:
         # max_pos left unbounded on the CLI: use the rightmost unit
         # position covered by any placed read instead.
         self.max_pos = 0
         for r_id, pos in self.read_placement.items():
             if pos is None:
                 continue
             ma = self.motif_alignments[r_id]
             self.max_pos = max(self.max_pos, pos + len(ma))
def main():
    """Cut every motif alignment of an NCRF report into its unit
    sequences, pad each unit with up to --buf flanking bases from the
    input sequence, and write one FASTA of units per aligned sequence.

    Fixes: the left-flank slice previously used a possibly negative start
    index (`al_start + start - buf`), which Python interprets as
    counting from the end of the sequence — for alignments starting
    closer than `buf` bases to the sequence start this silently produced
    an empty (or wrong) left buffer. The start is now clamped to 0.
    Also drops the unused `alignment` and `coords` locals.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()

    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        al_start = record.r_st
        # `start`/`end` track ungapped offsets within the aligned region.
        start = 0
        for ma in mas:
            ma_st = ma.start
            ma_en = ma.end
            seq_al = record.r_al[ma_st:ma_en]
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            # Clamp to 0 so a unit near the sequence start gets a
            # (possibly shortened) left buffer instead of a wrapped slice.
            seq_st = input_seq[max(0, al_start + start - params.buf):
                               al_start + start]
            # Slicing past the end is safe: Python truncates the slice.
            seq_en = input_seq[al_start + end:end + al_start + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            # Sanity check: the buffered unit must match the input
            # sequence at its reported coordinates.
            assert input_seq[start + al_start - len(seq_st):end + al_start +
                             len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
def main():
    """Polish the repeat unit consensus: collect the most frequent
    k-mers from the reads' NCRF report and rebuild the unit from them,
    then write the polished unit to the output FASTA."""
    args = parse_args()
    smart_makedirs(os.path.dirname(args.output))

    unit = read_bio_seq(args.unit)
    ncrf_report = NCRF_Report(args.reads_ncrf)

    counts, frequent = get_most_frequent_kmers(ncrf_report,
                                               k=args.k,
                                               unit_seq=unit)

    polished = get_polished_unit(k=args.k,
                                 most_frequent_kmers=frequent,
                                 kmer_counts_reads=counts,
                                 unit_seq=unit)

    write_bio_seqs(args.output, {'DXZ1*': polished})
Example #6
0
class ELTR_Polisher:
    """Polishes a tandem-repeat sequence position by position.

    For every unit position in [min_pos, max_pos], read-borne unit
    sequences are exported to FASTA and polished with Flye in
    --polish-target mode; the per-position results are then concatenated
    into one full sequence per polishing iteration.
    """

    def __init__(self, params):
        """Load the unit, the NCRF report, and the read placements, and
        derive the polishing position range.

        Args:
            params: parsed arguments; must provide unit (FASTA path),
                ncrf, outdir, read_placement, min_pos, max_pos, flye_bin,
                error_mode, num_iters, num_threads.

        Raises:
            FileNotFoundError: if the unit FASTA file does not exist.
        """
        self.params = params
        if not os.path.isfile(params.unit):
            raise FileNotFoundError(f"File {params.unit} is not found")
        self.unit = read_bio_seq(params.unit)
        self.ncrf_report = NCRF_Report(params.ncrf)
        self.motif_alignments = self.ncrf_report.get_motif_alignments()
        smart_makedirs(params.outdir)
        self.read_placement = read_reported_positions(params.read_placement)
        self.max_pos = self.params.max_pos
        self.min_pos = self.params.min_pos
        if self.max_pos == math.inf:
            # max_pos left unbounded on the CLI: use the rightmost unit
            # position covered by any placed read instead.
            self.max_pos = 0
            for r_id, pos in self.read_placement.items():
                if pos is None:
                    continue
                ma = self.motif_alignments[r_id]
                self.max_pos = max(self.max_pos, pos + len(ma))

    def map_pos2read(self):
        """Map each genomic unit position to the reads covering it.

        Returns:
            dict: position -> list of (read_id, unit_index_within_read).
        """
        pos2read = defaultdict(list)
        for r_id, pos in self.read_placement.items():
            if pos is None or pos > self.max_pos:
                continue
            ma = self.motif_alignments[r_id]
            # Reads touching the boundary of the polished range
            # contribute all their units; interior reads skip their
            # first and last unit (presumably because terminal units are
            # partial/less reliable — TODO confirm).
            if pos == self.min_pos or pos + len(ma) == self.max_pos:
                positions = range(len(ma))
            else:
                positions = range(1, len(ma) - 1)
            for i in positions:
                if self.min_pos <= pos + i <= self.max_pos:
                    pos2read[pos + i].append((r_id, i))
        return pos2read

    def export_read_units(self, pos2read):
        """Write, per position, all read unit sequences plus one
        median-length unit to serve as the polishing template.

        Args:
            pos2read: mapping from map_pos2read().

        Returns:
            dict: position -> (read_units_fasta, median_unit_fasta).
        """
        filenames = {}
        for pos in pos2read:
            outdir = os.path.join(self.params.outdir, f'pos_{pos}')
            units_fn = os.path.join(outdir, 'read_units.fasta')
            median_read_unit_fn = \
                os.path.join(outdir, 'median_read_unit.fasta')
            smart_makedirs(outdir)
            seqs = {}
            median_read_unit, template_read = "", None
            for (r_id, p) in pos2read[pos]:
                r_al = self.motif_alignments[r_id][p].r_al
                r_al = r_al.upper().replace('-', '')
                seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = r_al
            r_units_lens = [len(seq) for seq in seqs.values()]
            # median_high always returns an element of the input list, so
            # a unit of exactly this length is guaranteed to exist.
            med_len = statistics.median_high(r_units_lens)
            median_r_ids = []  # NOTE(review): unused — leftover variable?
            # Sorted iteration makes the template choice deterministic.
            for r_id in sorted(seqs.keys()):
                r_al = seqs[r_id]
                if len(r_al) == med_len:
                    median_read_unit = r_al
                    template_read = r_id
                    break
            assert len(seqs[template_read]) == med_len
            assert len(median_read_unit) == med_len
            write_bio_seqs(units_fn, seqs)
            write_bio_seqs(median_read_unit_fn,
                           {template_read: median_read_unit})
            filenames[pos] = (units_fn, median_read_unit_fn)
        return filenames

    def run_polishing(self, read_unit_filenames):
        """Run Flye in --polish-target mode for every exported position.

        Args:
            read_unit_filenames: mapping from export_read_units().

        Raises:
            subprocess.CalledProcessError: if a Flye invocation fails.

        NOTE(review): iterates every position in [min, max] but indexes
        read_unit_filenames directly — a position with no mapped reads
        would raise KeyError here.
        """
        min_pos = min(read_unit_filenames.keys())
        max_pos = max(read_unit_filenames.keys())
        for pos in range(min_pos, max_pos + 1):
            print(pos, max_pos)
            units_fn, median_read_unit_fn = read_unit_filenames[pos]
            pos_dir = os.path.dirname(units_fn)
            cmd = [self.params.flye_bin,
                   f'--{self.params.error_mode}-raw', units_fn,
                   '--polish-target', median_read_unit_fn,
                   '-i', self.params.num_iters,
                   '-t', self.params.num_threads,
                   '-o', pos_dir]
            # Flye expects string arguments; ints (iters/threads) must be
            # converted before the call.
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

    def read_polishing(self, read_unit_filenames):
        """Collect Flye's per-position outputs and concatenate them into
        one sequence per polishing iteration.

        Args:
            read_unit_filenames: mapping from export_read_units().

        Returns:
            dict: iteration number (1-based) -> concatenated sequence.
        """
        min_pos = min(read_unit_filenames.keys())
        max_pos = max(read_unit_filenames.keys())
        polished_seqs = {}
        final_sequences = {}
        for i in range(1, self.params.num_iters + 1):
            # polished_<i>.fasta is produced by Flye in each position dir.
            for pos, (units_fn, longest_read_unit_fn) in read_unit_filenames.items():
                pos_dir = os.path.dirname(units_fn)
                polished_seq_fn = os.path.join(pos_dir, f'polished_{i}.fasta')
                polished_seq = read_bio_seq(polished_seq_fn)
                polished_seqs[pos] = polished_seq
            final_sequence = \
                [polished_seqs[pos] for pos in range(min_pos, max_pos + 1)]
            final_sequence = ''.join(final_sequence)
            final_sequences[i] = final_sequence
        return final_sequences

    def compare_polished_sequences(self, final_sequences):
        """Write a report comparing consecutive polishing iterations,
        both raw and homopolymer-compressed, using edlib alignments.

        Args:
            final_sequences: mapping from read_polishing().
        """
        report_fn = os.path.join(self.params.outdir, 'report.txt')
        with open(report_fn, 'w') as f:
            for i in range(1, self.params.num_iters):
                seq_i, seq_i1 = final_sequences[i], final_sequences[i+1]
                alignment = edlib.align(seq_i, seq_i1)
                print(f'Alignment polishing seq {i} vs {i+1}:', file=f)
                print(alignment, file=f)

                hpc_seq_i = compress_homopolymer(final_sequences[i])
                hpc_seq_i1 = compress_homopolymer(final_sequences[i+1])
                alignment = edlib.align(hpc_seq_i, hpc_seq_i1)
                print(f'Alignment homopolymer compressed polishing seq {i} vs {i+1}:', file=f)
                print(alignment, file=f)

    def export_results(self, final_sequences):
        """Write each iteration's final sequence (raw and
        homopolymer-compressed) to FASTA files in the output dir.

        Args:
            final_sequences: mapping from read_polishing().
        """
        for i in range(1, self.params.num_iters + 1):
            final_sequence = final_sequences[i]
            final_sequence_hpc = compress_homopolymer(final_sequence)

            final_fn = os.path.join(self.params.outdir, f'final_sequence_{i}.fasta')
            write_bio_seqs(final_fn, {f'polished_repeat_{i}': final_sequence})

            final_hpc_fn = os.path.join(self.params.outdir, f'final_sequence_hpc_{i}.fasta')
            write_bio_seqs(final_hpc_fn, {f'polished_repeat_{i}': final_sequence_hpc})

    def run(self):
        """Execute the full polishing pipeline end to end."""
        pos2read = self.map_pos2read()
        read_unit_filenames = self.export_read_units(pos2read)
        self.run_polishing(read_unit_filenames)
        final_sequences = self.read_polishing(read_unit_filenames)
        self.compare_polished_sequences(final_sequences)
        self.export_results(final_sequences)
Example #7
0
class ReadPlacer:
    """Greedily places reads on a growing 'cloud contig' of k-mer clouds.

    Prefix reads are anchored at position 0; remaining reads are added
    one at a time at the position maximizing a k-mer intersection score.
    Placements are written to <outdir>/read_positions.csv.
    """

    def __init__(self, params):
        """Initialize the placer from parsed CLI arguments.

        Args:
            params: parsed arguments; must provide ncrf,
                min_cloud_kmer_freq, genomic_kmers (path or None),
                outdir, prefix_threshold, n_motif, k_cloud,
                min_kmer_mult, min_unit, min_inters.
        """
        self.params = params
        self.ncrf_report = NCRF_Report(params.ncrf)
        self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
        if params.genomic_kmers is not None:
            # One k-mer per line; duplicates collapse in the set.
            kmers = []
            with open(params.genomic_kmers) as f:
                for line in f:
                    kmers.append(line.strip())
            self.genomic_kmers = set(kmers)
        else:
            # No k-mer file supplied; downstream code must handle None.
            self.genomic_kmers = None
        smart_makedirs(params.outdir)
        # Output path for the per-read placement report.
        self.position_outfile = \
            os.path.join(self.params.outdir, 'read_positions.csv')

    def reset_cloud_contig(self):
        """Discard the current contig and start a fresh, empty one."""
        self.cloud_contig = CloudContig(self.params.min_cloud_kmer_freq)

    def add_prefix_reads(self, prefix_reads, reads_kmer_clouds):
        """Anchor all prefix reads at contig position 0 and record them.

        Args:
            prefix_reads: iterable of read ids classified as left-prefix.
            reads_kmer_clouds: mapping read id -> its k-mer clouds.

        Note: opens the output file in 'w' mode, so this truncates any
        previous placement report.
        """
        with open(self.position_outfile, 'w') as f:
            for r_id in prefix_reads:
                read_kmer_clouds = reads_kmer_clouds[r_id]
                self.cloud_contig.add_read(read_kmer_clouds, position=0)
                print(r_id, 0, file=f)

    def add_reads(self,
                  reads,
                  reads_kmer_clouds,
                  min_unit,
                  min_inters,
                  min_prop=3):
        """Greedily place reads on the contig, best-scoring read first.

        Each round rescans all unplaced reads, picks the (read, position)
        with the best score that passes the thresholds, adds that read to
        the contig, and appends its placement to the report. Reads that
        can no longer be placed are reported with position None.

        Args:
            reads: iterable of read ids to place.
            reads_kmer_clouds: mapping read id -> its k-mer clouds.
            min_unit: minimum number of matched cloud units required.
            min_inters: minimum total k-mer intersection required.
            min_prop: minimum average intersections per matched unit.
        """
        # Inverted index: k-mer -> list of (read id, cloud index).
        kmers2pos = defaultdict(list)
        for r_id in reads:
            kmer_clouds = reads_kmer_clouds[r_id]
            for i, cloud in enumerate(kmer_clouds.kmers):
                for kmer in cloud:
                    kmers2pos[kmer].append((r_id, i))

        unused_reads = set(reads)
        n_reads = len(unused_reads)
        scores = None
        # Seed the score update with the contig's current frequent k-mers.
        freq_kmers = []
        for kmer in self.cloud_contig.freq_kmers:
            for pos in self.cloud_contig.kmer_positions[kmer]:
                freq_kmers.append((kmer, pos))
        with open(self.position_outfile, 'a') as f:
            while len(unused_reads):
                # Incremental rescore: only the newly added read's
                # frequent k-mers are folded into the existing scores.
                scores = update_mapping_scores(self.cloud_contig,
                                               kmers2pos,
                                               freq_kmers=freq_kmers,
                                               scores=scores)
                best_score, best_position, best_read = (-1, -1), None, None
                for r_id in unused_reads:
                    for pos in scores[r_id]:
                        score = scores[r_id][pos]
                        # score: (#matched units, total intersections).
                        score = (len(score), sum(score.values()))
                        # Accept if strictly better and above thresholds;
                        # break ties by larger position, then smaller id.
                        if (score > best_score and
                                score[0] >= min_unit and
                                score[0] * min_prop <= score[1] and
                                score[1] >= min_inters) or \
                            (score == best_score and pos > best_position) or \
                                (score == best_score and
                                 pos == best_position and
                                 r_id < best_read):
                            best_score = score
                            best_position = pos
                            best_read = r_id
                if best_read is None:
                    # Nothing placeable remains: report the leftovers as
                    # unplaced (position None) and stop.
                    print(f"Unused reads {len(unused_reads)}, {n_reads}, "
                          f"{len(unused_reads) / n_reads}")
                    for read in unused_reads:
                        print(read, None, file=f)
                    return
                print(best_score, best_position, best_read)
                print("")
                print(best_read,
                      best_position,
                      best_score[0],
                      best_score[1],
                      file=f)
                best_read_cloud = reads_kmer_clouds[best_read]

                # add_read returns the new frequent k-mers introduced by
                # this read, used by the next incremental rescore.
                freq_kmers = self.cloud_contig.add_read(best_read_cloud,
                                                        position=best_position)
                unused_reads.remove(best_read)

    def run(self):
        """Run the full placement: classify reads, build k-mer clouds,
        anchor prefix reads, then greedily place inner and suffix reads."""
        left_PT_reads, FT_reads, right_PT_reads = \
            self.ncrf_report.classify(
                large_threshold=self.params.prefix_threshold)

        print(f'Left: {len(left_PT_reads)}')
        print(f'FT: {len(FT_reads)}')
        print(f'Right: {len(right_PT_reads)}')

        print("Reading kmer clouds from reads")
        reads_kmer_clouds = \
            get_reads_kmer_clouds(self.ncrf_report,
                                  n=self.params.n_motif,
                                  k=self.params.k_cloud,
                                  genomic_kmers=self.genomic_kmers)
        print("Filtering kmer clouds from reads")
        reads_kmer_clouds = \
            filter_reads_kmer_clouds(reads_kmer_clouds,
                                     min_mult=self.params.min_kmer_mult)
        print("Adding prefix reads")
        self.add_prefix_reads(left_PT_reads, reads_kmer_clouds)
        print(self.cloud_contig.max_pos)

        print("Adding inner reads")
        self.add_reads(FT_reads,
                       reads_kmer_clouds,
                       min_unit=self.params.min_unit,
                       min_inters=self.params.min_inters)
        print(self.cloud_contig.max_pos)

        print("\nNow adding suffix reads")
        self.add_reads(right_PT_reads,
                       reads_kmer_clouds,
                       min_unit=self.params.min_unit,
                       min_inters=self.params.min_inters)