Example no. 1
 def export_read_units(self, pos2read):
     filenames = {}
     for pos in pos2read:
         outdir = os.path.join(self.params.outdir, f'pos_{pos}')
         units_fn = os.path.join(outdir, 'read_units.fasta')
         median_read_unit_fn = \
             os.path.join(outdir, 'median_read_unit.fasta')
         smart_makedirs(outdir)
         seqs = {}
         median_read_unit, template_read = "", None
         for (r_id, p) in pos2read[pos]:
             r_al = self.motif_alignments[r_id][p].r_al
             r_al = r_al.upper().replace('-', '')
             seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = r_al
         r_units_lens = [len(seq) for seq in seqs.values()]
         # median_high picks an element of the list, so at least one
         # read unit is guaranteed to have exactly this length
         med_len = statistics.median_high(r_units_lens)
         median_r_ids = []
         for r_id in sorted(seqs.keys()):
             r_al = seqs[r_id]
             if len(r_al) == med_len:
                 median_read_unit = r_al
                 template_read = r_id
                 break
         assert len(seqs[template_read]) == med_len
         assert len(median_read_unit) == med_len
         write_bio_seqs(units_fn, seqs)
         write_bio_seqs(median_read_unit_fn,
                        {template_read: median_read_unit})
         filenames[pos] = (units_fn, median_read_unit_fn)
     return filenames
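
All of these snippets rely on a handful of small I/O helpers imported from the project's utility modules. The following is a minimal sketch of what they are assumed to do; the actual project implementations may differ in details such as error handling and line wrapping:

import os

def smart_makedirs(dirname):
    # create the directory (and any missing parents) if it does not exist
    os.makedirs(dirname, exist_ok=True)

def write_bio_seqs(filename, seqs):
    # write a {seq_id: sequence} dict as a FASTA file
    with open(filename, 'w') as f:
        for seq_id, seq in seqs.items():
            f.write(f'>{seq_id}\n{seq}\n')

def read_bio_seqs(filename):
    # read a FASTA file into a {seq_id: sequence} dict
    seqs, seq_id, chunks = {}, None, []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq_id is not None:
                    seqs[seq_id] = ''.join(chunks)
                seq_id, chunks = line[1:], []
            elif line:
                chunks.append(line)
    if seq_id is not None:
        seqs[seq_id] = ''.join(chunks)
    return seqs

def read_bio_seq(filename):
    # read a single-record FASTA file and return just the sequence
    seqs = read_bio_seqs(filename)
    return list(seqs.values())[0]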
Example no. 2
def main():
    params = parse_args()
    smart_makedirs(params.outdir)

    reads_ncrf_report = NCRF_Report(params.ncrf)
    rare_kmers = get_rare_kmers(reads_ncrf_report,
                                k=params.k,
                                bottom=params.bottom,
                                top=params.top,
                                coverage=params.coverage,
                                kmer_survival_rate=params.kmer_survival_rate,
                                max_nonuniq=params.max_nonuniq,
                                verbose=params.verbose)

    reads_kmer_clouds = get_reads_kmer_clouds(reads_ncrf_report,
                                              n=1,
                                              k=params.k,
                                              genomic_kmers=rare_kmers)

    dist_cnt, kmer_index = get_kmer_dist_map(reads_kmer_clouds,
                                             rare_kmers,
                                             min_n=params.min_nreads,
                                             max_n=params.max_nreads,
                                             min_d=params.min_distance,
                                             max_d=params.max_distance,
                                             verbose=params.verbose)

    unique_kmers_ind, dist_edges = \
        filter_dist_tuples(dist_cnt, min_coverage=params.min_coverage)

    output_results(kmer_index=kmer_index,
                   min_coverage=params.min_coverage,
                   unique_kmers_ind=unique_kmers_ind,
                   dist_edges=dist_edges,
                   outdir=params.outdir)
Example no. 3
    def from_read_db_and_assembly(cls, gr_reads, assembly, outdir=None):
        k = gr_reads.k
        gr_assembly, _ = sequence_graph.idb_graph.get_db_monostring_set(
            assembly, k=k, outdir=None, mode='assembly')
        color3graph = cls.from_db_graphs(gr_assembly=gr_assembly,
                                         gr_reads=gr_reads)
        if outdir is not None:
            smart_makedirs(outdir)

            asm_dot_file = os.path.join(outdir, f'db_asm_k{k}.dot')
            gr_assembly.write_dot(outfile=asm_dot_file, export_pdf=False)

            asm_dot_compact_file = os.path.join(outdir,
                                                f'db_asm_k{k}_compact.dot')
            gr_assembly.write_dot(outfile=asm_dot_compact_file,
                                  export_pdf=True,
                                  compact=True)
            asm_pickle_file = os.path.join(outdir, f'db_asm_k{k}.pickle')
            gr_assembly.pickle_dump(asm_pickle_file)

            c3g_dot_file = os.path.join(outdir, f'c3g_k{k}.dot')
            color3graph.write_dot(outfile=c3g_dot_file,
                                  export_pdf=True,
                                  compact=True)
            c3g_pickle_file = os.path.join(outdir, f'c3g_k{k}.pickle')
            color3graph.pickle_dump(c3g_pickle_file)
        return color3graph
Example no. 4
def iterative_graph(monostrings,
                    min_k,
                    max_k,
                    outdir,
                    min_mult=5,
                    step=1,
                    starting_graph=None,
                    verbose=True):
    smart_makedirs(outdir)
    dbs, all_contigs = {}, {}
    all_frequent_kmers, all_frequent_kmers_read_pos = {}, {}
    strings = {k: ''.join(v.string) for k, v in monostrings.items()}
    input_strings = strings.copy()
    complex_kp1mers = {}

    if starting_graph is not None:
        contigs, contig_paths = starting_graph.get_contigs()
        # replicate each contig min_mult times so that its k-mers pass
        # the multiplicity threshold at the next value of k
        for i in range(len(contigs)):
            for j in range(min_mult):
                input_strings[f'contig_k{min_k}_i{i}_j{j}'] = contigs[i]

        complex_kp1mers = get_paths_thru_complex_nodes(starting_graph, strings)

    for k in range(min_k, max_k + 1, step):
        frequent_kmers, frequent_kmers_read_pos = \
            get_frequent_kmers(input_strings, k=k, min_mult=min_mult)
        frequent_kmers.update(complex_kp1mers)
        if verbose:
            print(f'\nk={k}')
            print(f'#frequent kmers = {len(frequent_kmers)}')
        all_frequent_kmers[k] = frequent_kmers
        all_frequent_kmers_read_pos[k] = frequent_kmers_read_pos

        db = DeBruijnGraph(k=k)
        # frequent_kmers maps kmer -> multiplicity, so it doubles as coverage
        db.add_kmers(frequent_kmers, coverage=frequent_kmers)

        db.collapse_nonbranching_paths()
        if verbose and nx.number_weakly_connected_components(db.graph) > 1:
            print(f'#cc = {nx.number_weakly_connected_components(db.graph)}')
            for cc in nx.weakly_connected_components(db.graph):
                print(len(cc))
            # break
        dbs[k] = db

        dot_file = os.path.join(outdir, f'db_k{k}.dot')
        # pdf_file = os.path.join(outdir, f'db_k{k}.pdf')
        nx.drawing.nx_pydot.write_dot(db.graph, dot_file)
        # os.system(f"dot -Tpdf {dot_file} -o {pdf_file}")

        contigs, contig_paths = db.get_contigs()
        all_contigs[k] = contigs

        input_strings = strings.copy()
        for i in range(len(contigs)):
            for j in range(min_mult):
                input_strings[f'contig_k{k}_i{i}_j{j}'] = contigs[i]

        complex_kp1mers = get_paths_thru_complex_nodes(db, strings)

    return all_contigs, dbs, all_frequent_kmers, all_frequent_kmers_read_pos
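
The snippet above assumes `get_frequent_kmers`, which is not shown. A minimal stand-in consistent with how it is called here, returning both the filtered counts and the positions of the retained k-mers (the project's real implementation may track positions differently):

from collections import Counter, defaultdict

def get_frequent_kmers(strings, k, min_mult):
    # count every k-mer over a {string_id: string} dict and keep those
    # occurring at least min_mult times, together with their positions
    counts = Counter()
    positions = defaultdict(list)
    for s_id, s in strings.items():
        for i in range(len(s) - k + 1):
            kmer = s[i:i + k]
            counts[kmer] += 1
            positions[kmer].append((s_id, i))
    frequent_kmers = {kmer: cnt for kmer, cnt in counts.items()
                      if cnt >= min_mult}
    frequent_kmers_read_pos = {kmer: positions[kmer]
                               for kmer in frequent_kmers}
    return frequent_kmers, frequent_kmers_read_pos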
Example no. 5
 def __init__(self, params):
     self.params = params
     self.ncrf_report = NCRF_Report(params.ncrf)
     self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
     if params.genomic_kmers is not None:
         with open(params.genomic_kmers) as f:
             self.genomic_kmers = {line.strip() for line in f}
     else:
         self.genomic_kmers = None
     smart_makedirs(params.outdir)
     self.position_outfile = \
         os.path.join(self.params.outdir, 'read_positions.csv')
Example no. 6
    def toDB(self, outdir=None, assembly=None):
        nx_graph = nx.MultiDiGraph()
        nodeindex2label = {}
        nodelabel2index = {}
        for i, (u, v, key) in self.index2edge.items():
            seq = tuple(self.edge2seq[i])
            u_label = seq[:self.k - 1]
            v_label = seq[-self.k + 1:]
            nodelabel2index[u_label] = u
            nodelabel2index[v_label] = v
            nodeindex2label[u] = u_label
            nodeindex2label[v] = v_label
            edge_len = len(seq) - self.k + 1
            # true coverage is unknown at this point, so use a
            # placeholder of 1 per position
            cov = [1] * edge_len
            mean_cov = np.mean(cov)
            label = f'index={i}\nlen={edge_len}\ncov={mean_cov:0.2f}'
            nx_graph.add_edge(u,
                              v,
                              key=key,
                              coverage=cov,
                              edge_index=i,
                              edge_len=edge_len,
                              label=label,
                              string=seq,
                              color='black')
        db = DeBruijnGraph(k=self.k,
                           nx_graph=nx_graph,
                           nodeindex2label=nodeindex2label,
                           nodelabel2index=nodelabel2index)
        if outdir is not None:
            smart_makedirs(outdir)
            dot_file = os.path.join(outdir, f'db_K{self.k}.dot')
            db.write_dot(outfile=dot_file, export_pdf=False)

            dot_compact_file = os.path.join(outdir,
                                            f'db_K{self.k}_compact.dot')
            db.write_dot(outfile=dot_compact_file,
                         export_pdf=True,
                         compact=True)
            pickle_file = os.path.join(outdir, f'db_K{self.k}.pickle')
            db.pickle_dump(pickle_file)
            if assembly is not None:
                DeBruijnGraph3Color.from_read_db_and_assembly(
                    gr_reads=db, assembly=assembly, outdir=outdir)
        return db
Example no. 7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Directory with read units",
                        required=True)
    # parser.add_argument("-r",
    #                     "--reads",
    #                     help="Input reads",
    #                     required=True)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("-b",
                        "--bin-size",
                        help="bin size",
                        type=int,
                        default=50)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    # reads = read_bio_seqs(params.reads)

    units = get_units(params.input)
    unit_lens = sorted(len(unit) for unit in units.values())
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(unit_lens, bin_size=params.bin_size)

    # Currently support only one cluster
    filt_units = \
        {k: v for k, v in units.items() if bin_left <= len(v) <= bin_right}
    filt_units_fn = os.path.join(params.outdir, 'cluster_units.fasta')
    write_bio_seqs(filt_units_fn, filt_units)

    median_unit_id, median_unit, median_len = select_median_seq(filt_units)
    median_read_unit_fn = os.path.join(params.outdir, 'median_read_unit.fasta')
    write_bio_seqs(median_read_unit_fn, {median_unit_id: median_unit})

    cmd = [
        'flye', '--nano-raw', filt_units_fn, '--polish-target',
        median_read_unit_fn, '-i', 2, '-t', 50, '-o', params.outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)
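
`select_median_seq` is not shown above; a plausible sketch, mirroring the median-length selection pattern used in the other examples (statistics.median_high guarantees the chosen length is attained by an actual sequence):

import statistics

def select_median_seq(seqs):
    # return (id, sequence, length) of a sequence whose length is the
    # high median; ties are broken by sorted id order
    med_len = statistics.median_high([len(seq) for seq in seqs.values()])
    for seq_id in sorted(seqs):
        if len(seqs[seq_id]) == med_len:
            return seq_id, seqs[seq_id], med_len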
Example no. 8
 def __init__(self, params):
     self.params = params
     if not os.path.isfile(params.unit):
         raise FileNotFoundError(f"File {params.unit} is not found")
     self.unit = read_bio_seq(params.unit)
     self.ncrf_report = NCRF_Report(params.ncrf)
     self.motif_alignments = self.ncrf_report.get_motif_alignments()
     smart_makedirs(params.outdir)
     self.read_placement = read_reported_positions(params.read_placement)
     self.max_pos = self.params.max_pos
     self.min_pos = self.params.min_pos
     if self.max_pos == math.inf:
         # no explicit max position was given: infer it from the
         # rightmost motif alignment of any placed read
         self.max_pos = 0
         for r_id, pos in self.read_placement.items():
             if pos is None:
                 continue
             ma = self.motif_alignments[r_id]
             self.max_pos = max(self.max_pos, pos + len(ma))
Example no. 9
def run_on_read(seq, seq_id, k, bin_size, outdir):
    print("Getting repetitive kmers")
    rep_kmers = get_repetitive_kmers(seq, k)
    print("Getting union convolution")
    conv, union_conv = get_convolution(rep_kmers)
    print("Getting periods")
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(union_conv, bin_size=bin_size)
    print(f"Selected period = {periods[0]}")
    print("Getting hook")
    hook = get_hook_kmer(conv, bin_left, bin_right)
    if hook is None:
        return
    print("Splitting by hook")
    splits = split_by_hook(seq, hook)
    med_len = \
        statistics.median_high([len(x) for x in splits.values()])

    for r_id in sorted(splits.keys()):
        r_al = splits[r_id]
        if len(r_al) == med_len:
            median_read_unit = r_al
            template_read = r_id
            break
    read_outdir = os.path.join(outdir, seq_id[:8])
    smart_makedirs(read_outdir)
    splits_outfile = os.path.join(read_outdir, 'splits.fasta')
    median_read_unit_fn = os.path.join(read_outdir, 'median_read_unit.fasta')
    write_bio_seqs(splits_outfile, splits)
    write_bio_seqs(median_read_unit_fn, {template_read: median_read_unit})

    print("Running Flye")
    cmd = [
        'flye', '--nano-raw', splits_outfile, '--polish-target',
        median_read_unit_fn, '-i', 2, '-t', 50, '-o', read_outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)

    plt.hist(union_conv, bins=100)
    plt.title(f'Tandem read convolution, {seq_id[:8]}, period={periods[0]}')
    plt.savefig(os.path.join(read_outdir, f'{seq_id[:8]}.pdf'), format='pdf')
    plt.close()
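
`split_by_hook` is assumed here to cut the read at consecutive occurrences of the hook k-mer, producing one candidate unit per pair of neighboring occurrences. A hypothetical stand-in (the real function may also keep the flanking prefix and suffix):

def split_by_hook(seq, hook):
    # locate every occurrence of the hook k-mer ...
    starts = []
    i = seq.find(hook)
    while i != -1:
        starts.append(i)
        i = seq.find(hook, i + 1)
    # ... and emit one unit per pair of consecutive occurrences
    return {f'unit_{j}': seq[st:en]
            for j, (st, en) in enumerate(zip(starts, starts[1:]))}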
Example no. 10
def main():
    params = parse_args()
    outdir = os.path.dirname(params.output)
    smart_makedirs(outdir)

    reads_ncrf_report = NCRF_Report(params.reads_ncrf)
    unit_seq = read_bio_seq(params.unit)

    kmer_counts_reads, most_frequent_kmers = \
        get_most_frequent_kmers(reads_ncrf_report,
                                k=params.k,
                                unit_seq=unit_seq)

    new_unit = get_polished_unit(k=params.k,
                                 most_frequent_kmers=most_frequent_kmers,
                                 kmer_counts_reads=kmer_counts_reads,
                                 unit_seq=unit_seq)

    write_bio_seqs(params.output, {'DXZ1*': new_unit})
Example no. 11
def output_results(tr, left_flanked_tr, flanked_tr, all_muts, output_dir):
    smart_makedirs(output_dir)
    write_bio_seqs(os.path.join(output_dir, 'tandem_repeat.fasta'),
                   {'sim_tr': tr})
    write_bio_seqs(os.path.join(output_dir,
                                'left_flanked_tandem_repeat.fasta'),
                   {'left_flanked_sim_tr': left_flanked_tr})
    write_bio_seqs(os.path.join(output_dir, 'flanked_tandem_repeat.fasta'),
                   {'flanked_sim_tr': flanked_tr})
    with open(os.path.join(output_dir, 'all_muts.json'), 'w') as f:
        all_muts = dict(all_muts)
        all_muts = stringify_keys(all_muts)
        print(json.dumps(all_muts), file=f)
    with open(os.path.join(output_dir, 'simulation.log'), 'w') as f:
        total_n_mut = sum(len(x) for x in all_muts.values())
        print(f'full_tr_len = {len(tr)}', file=f)
        print(f'total_n_mut = {total_n_mut}', file=f)
        for pos, muts in all_muts.items():
            print(f'{pos} : {len(muts)}', file=f)
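
`stringify_keys` is needed because json.dumps only accepts string keys, while the mutation positions used as keys here are not strings. A minimal sketch of the assumed helper:

def stringify_keys(d):
    # recursively convert dict keys to str so the dict is JSON-serializable
    if not isinstance(d, dict):
        return d
    return {str(k): stringify_keys(v) for k, v in d.items()}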
Example no. 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()

    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        coords = {}
        al_start = record.r_st
        alignment = record.r_al.replace('-', '')
        start = 0
        for ma in mas:
            ma_st = ma.start
            ma_en = ma.end
            seq_al = record.r_al[ma_st:ma_en]
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            seq_st = input_seq[al_start + start - params.buf:al_start + start]
            seq_en = input_seq[al_start + end:al_start + end + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            coords[ma_id] = (start + al_start, end + al_start)
            # print(input_seq[start+al_start:end+al_start])
            # print(seq[params.buf:-params.buf])
            assert input_seq[start + al_start - len(seq_st):end + al_start +
                             len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
Example no. 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input reads", required=True)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("-k", help="kmer len", type=int, default=15)
    parser.add_argument("-b",
                        "--bin-size",
                        help="bin size",
                        type=int,
                        default=10)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    reads = read_bio_seqs(params.input)
    for r_id, seq in reads.items():
        run_on_read(seq,
                    seq_id=r_id,
                    k=params.k,
                    bin_size=params.bin_size,
                    outdir=params.outdir)
Example no. 14
def get_idb(string_set,
            mink, maxk,
            outdir,
            mode='ont',
            assembly=None,
            get_min_mult=None,
            get_frequent_kmers=None,
            all_kmer_index=None,
            ignored_chars=None,
            step=1):

    if outdir is not None:
        logger.info(f'IDB will be saved to {outdir}')
        smart_makedirs(outdir)
    else:
        logger.info('IDB will not be saved since outdir is None')

    assert mode in ['ont', 'hifi', 'assembly']
    if get_min_mult is None:
        get_min_mult = def_get_min_mult
    if get_frequent_kmers is None:
        get_frequent_kmers = def_get_frequent_kmers

    if all_kmer_index is None:
        all_kmer_index = get_kmer_index(seqs=string_set,
                                        mink=mink, maxk=maxk,
                                        ignored_chars=ignored_chars)
    else:
        assert all(k in all_kmer_index
                   for k in range(mink, maxk+1, step))

    dbs = {}
    all_frequent_kmers = {}

    contig_kmers = {}
    complex_kp1mers = {}
    for k in range(mink, maxk+1, step):
        min_mult = get_min_mult(k=k, mode=mode)
        kmer_index = all_kmer_index[k]
        frequent_kmers = get_frequent_kmers(kmer_index=kmer_index,
                                            string_set=string_set,
                                            min_mult=min_mult)
        # extending frequent kmers with contig kmers
        for kmer, cnt in contig_kmers.items():
            if kmer not in frequent_kmers:
                frequent_kmers[kmer] = cnt

        # extending frequent kmers with k+1-mers that pass through complex
        # nodes
        for kmer, cnt in complex_kp1mers.items():
            if kmer in frequent_kmers:
                assert cnt == frequent_kmers[kmer]
        frequent_kmers.update(complex_kp1mers)

        all_frequent_kmers[k] = frequent_kmers

        logger.info(f'k={k}')
        logger.info(f'#frequent kmers = {len(frequent_kmers)}')
        logger.info(f'min_mult = {min_mult}')

        db = DeBruijnGraph.from_kmers(kmers=frequent_kmers.keys(),
                                      kmer_coverages=frequent_kmers,
                                      min_tip_cov=min_mult)
        ncc = nx.number_weakly_connected_components(db.nx_graph)
        logger.info(f'#cc = {ncc}')
        for i, cc in enumerate(nx.weakly_connected_components(db.nx_graph)):
            logger.info(f'{i}-th cc is of size = {len(cc)}')

        if outdir is not None:
            dot_file = os.path.join(outdir, f'db_k{k}.dot')
            db.write_dot(outfile=dot_file, export_pdf=False)

            dot_compact_file = os.path.join(outdir, f'db_k{k}_compact.dot')
            db.write_dot(outfile=dot_compact_file,
                         export_pdf=True,
                         compact=True)
            pickle_file = os.path.join(outdir, f'db_k{k}.pickle')
            db.pickle_dump(pickle_file)

            if assembly is not None:
                sequence_graph.db_graph_3col.DeBruijnGraph3Color.\
                    from_read_db_and_assembly(gr_reads=db,
                                              assembly=assembly,
                                              outdir=outdir)
        dbs[k] = db

        if k < maxk:
            contigs, _ = db.get_contigs()
            contig_kmers = Counter()
            for contig in contigs:
                for i in range(len(contig)-(k+1)+1):
                    kmer = contig[i:i+k+1]
                    contig_kmers[kmer] += 1

            complex_kp1mers = \
                db.get_paths_thru_complex_nodes(all_kmer_index[k+1])
    return dbs, all_frequent_kmers
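
A hedged usage sketch of `get_idb`; the variable `monoreads` and the parameter values below are illustrative assumptions, not the project's defaults:

# monoreads: {read_id: monostring}, e.g. produced from an SD report
dbs, all_frequent_kmers = get_idb(string_set=monoreads,
                                  mink=25,    # illustrative value
                                  maxk=149,   # illustrative value
                                  outdir='idb',
                                  mode='ont')
final_db = dbs[149]  # graph built for the largest k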
Example no. 15
def main():
    params = parse_args()
    smart_makedirs(params.outdir)

    print('Reading report')
    sd_report = SD_Report(SD_report_fn=params.sd_report,
                          monomers_fn=params.monomers)

    print('Error correcting monoreads')
    ec_monostrings = error_correction(sd_report.monostrings,
                                      verbose=True,
                                      inplace=False)

    print('Building the graph')
    contigs, dbs, all_frequent_kmers, all_frequent_kmers_read_pos = \
        iterative_graph(ec_monostrings,
                        min_k=params.min_k,
                        max_k=params.max_k,
                        outdir=os.path.join(params.outdir, 'idb'),
                        min_mult=params.min_mult)
    db = dbs[params.max_k]

    print('Mapping reads to the graph')
    mappings = db.map_reads(ec_monostrings, verbose=False)

    print('Scaffolding')
    scaffolds, edge_scaffolds = scaffolding(db, mappings)

    # Manual connection of two scaffolds for cen6
    # TODO
    cen6_scaffold = scaffolds[0] + scaffolds[1][db.k - 1:]
    cen6_edge_scaffold = edge_scaffolds[0] + edge_scaffolds[1]

    print('Mapping reads to scaffolds')
    r2s = read2scaffolds(db, [cen6_edge_scaffold], mappings, ec_monostrings)

    print('Covering scaffolds with reads')
    scaf_read_coverage = cover_scaffolds_w_reads(r2s,
                                                 mappings, [cen6_scaffold],
                                                 ec_monostrings,
                                                 k=db.k)

    print('Extracting pseudounits and reads covering them')
    pseudounits, read_pseudounits = \
        extract_read_pseudounits(scaf_read_coverage,
                                 [cen6_scaffold],
                                 monostrings=ec_monostrings)

    print('Reading centromeric reads')
    centromeric_reads = read_bio_seqs(params.centromeric_reads)
    monomers = read_bio_seqs(params.monomers)

    print('Polishing')
    polish(scaffolds=[cen6_scaffold],
           pseudounits=pseudounits,
           read_pseudounits=read_pseudounits,
           reads=centromeric_reads,
           monomers=monomers,
           outdir=os.path.join(params.outdir, 'polishing'),
           n_iter=params.polish_n_iter,
           n_threads=params.polish_n_threads)
Example no. 16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--dbg", required=True,
                        help="Directory with DBG output")
    parser.add_argument("-o", "--outdir", required=True)
    parser.add_argument("--ref")
    parser.add_argument("--refhpc", action='store_true')
    parser.add_argument("--no_export_pdf", action='store_true')
    parser.add_argument("-K", type=int, default=40002)
    params = parser.parse_args()

    params.dbg = expandpath(params.dbg)
    params.outdir = expandpath(params.outdir)
    smart_makedirs(params.outdir)
    logfn = os.path.join(params.outdir, 'inc_k.log')
    global logger
    logger = get_logger(logfn,
                        logger_name='centroFlye: inc_k')
    logger.info(f'cmd: {sys.argv}')
    logger.info(f'git hash: {get_git_revision_short_hash()}')

    db_fn = os.path.join(params.dbg, 'graph.fasta')
    align_fn = os.path.join(params.dbg, 'alignments.txt')
    dbg_log_fn = os.path.join(params.dbg, 'dbg.log')
    with open(dbg_log_fn) as f:
        cmd = f.readline().strip().split(' ')
        i = 0
        while cmd[i] != '-k':
            i += 1
        k = int(cmd[i+1]) + 1
    logger.info(f'init k = {k}')
    logger.info(f'Reading DBG output from {params.dbg}')
    lpdb = PathMultiKGraph.fromDR(db_fn=db_fn, align_fn=align_fn,
                                  k=k, K=params.K)
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')
    logger.info('Finished reading DBG output')
    logger.info('Starting increasing k')
    lpdb.transform_fast_until_saturated()
    logger.info('Finished increasing k')
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')

    outac = os.path.join(params.outdir, 'active_connections.txt')
    logger.info(f'Active connections output to {outac}')
    with open(outac, 'w') as f:
        ac = lpdb.idb_mappings.get_active_connections()
        ac = sorted(list(ac))
        for i, j in ac:
            print(f'{i} {j}', file=f)

    outuniquedges = os.path.join(params.outdir, 'unique_edges.txt')
    logger.info(f'Unique edges output to {outuniquedges}')
    with open(outuniquedges, 'w') as f:
        for index in sorted(list(lpdb.unique_edges)):
            print(index, file=f)

    outdot = os.path.join(params.outdir, f'dbg_{k}-{lpdb.init_k+lpdb.niter}')
    logger.info(f'Writing final graph to {outdot}')

    outfasta = outdot + '.fasta'
    logger.info(f'Writing graph edges to {outfasta}')
    edges = {key: ''.join(edge) for key, edge in lpdb.edge2seq.items()}
    write_bio_seqs(outfasta, edges)

    lpdb.write_dot(params.outdir, compact=True,
                   reffn=params.ref, refhpc=params.refhpc,
                   export_pdf=not params.no_export_pdf)
    logger.info('Finished writing final graph (dot)')
    with open(outdot + ".graph", "w") as out:
        for edge in lpdb.nx_graph.edges(keys=True):
            index = lpdb.edge2index[edge]
            seq = lpdb.edge2seq[index]
            header = "_".join([str(index),
                               str(edge[0]), str(lpdb.node2len[edge[0]]),
                               str(edge[1]), str(lpdb.node2len[edge[1]])])
            out.write(">" + header + "\n")
            out.write("".join(seq) + "\n")
Example no. 17
def polish(scaffolds,
           pseudounits,
           read_pseudounits,
           reads,
           monomers,
           outdir,
           n_iter,
           n_threads,
           flye_bin='flye'):
    def get_template(scaffold, st, en):
        return ''.join(monomers[m_id] for m_id in scaffold[st:en + 1])

    # keep only forward-strand monomers; ids ending with ' denote
    # reverse complements, and only the first character names the monomer
    monomers = {
        m_id[0]: monomer
        for m_id, monomer in monomers.items() if m_id[-1] != "'"
    }
    smart_makedirs(outdir)
    for i, (scaffold,
            scaf_pseudounits) in enumerate(zip(scaffolds, pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)

        polished_scaffold = []
        for j, (s_st, s_en) in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)

            # template = get_template(scaffold, s_st, s_en)
            # template_id = f'scaffold_{i}_template_{j}_{scaffold[s_st:s_en+1]}'
            # write_bio_seqs(template_fn, {template_id: template})

            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            template_id, template_read = "", None
            r_units_lens = [len(read) for read in pseudounit_reads.values()]
            med_len = statistics.median_high(r_units_lens)
            for r_id in sorted(pseudounit_reads.keys()):
                read = pseudounit_reads[r_id]
                if len(read) == med_len:
                    template_id = r_id
                    template_read = read
                    break
            assert len(pseudounit_reads[template_id]) == med_len
            assert len(template_read) == med_len
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [
                flye_bin, '--nano-raw', reads_fn, '--polish-target',
                template_fn, '-i', n_iter, '-t', n_threads, '-o',
                pseudounit_outdir
            ]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

            try:
                polished_pseudounit_fn = \
                    os.path.join(pseudounit_outdir,
                                 f'polished_{n_iter}.fasta')
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
                polished_scaffold.append(polished_pseudounit)
            except FileNotFoundError:
                # fall back to the unpolished template read if Flye
                # produced no polished output
                polished_scaffold.append(template_read)

        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir, f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})
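
`RC` (reverse complement) is used above for minus-strand reads; a standard sketch of the assumed utility:

_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')

def RC(seq):
    # reverse complement of a DNA string
    return seq.translate(_COMPLEMENT)[::-1]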
Example no. 18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t",
                        "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)

    reads = read_bio_seqs(params.reads)
    reads_split = chunks2(list(reads.keys()), params.threads)
    reads_chunks_fn = {}
    for i in range(len(reads_split)):
        reads_chunk = {k: reads[k] for k in reads_split[i]}
        outdir = os.path.join(params.outdir, 'split_reads')
        smart_makedirs(outdir)
        reads_fn = os.path.join(outdir, f'split_reads_{i}.fasta')
        reads_chunks_fn[i] = reads_fn
        write_bio_seqs(reads_fn, reads_chunk)

    ps = []
    ncrf_reports_fn = []
    for i, fn in reads_chunks_fn.items():
        outdir = os.path.join(params.outdir, 'ncrf_report')
        smart_makedirs(outdir)
        ncrf_report_fn = os.path.join(outdir, f'report_{i}.ncrf')
        with open(ncrf_report_fn, 'w') as f:
            p1 = Popen(['cat', fn], stdout=PIPE)
            p2 = Popen([params.ncrf_bin, f'unit:{repeat}'],
                       stdin=p1.stdout,
                       stdout=f)
            p1.stdout.close()  # let p1 get SIGPIPE if p2 exits early
            ps.append(p2)
        ncrf_reports_fn.append(ncrf_report_fn)
    for p in ps:
        p.wait()

    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        cmd1 = ['cat'] + ncrf_reports_fn
        p1 = Popen(cmd1, stdout=PIPE)
        cmd2 = f"grep -v -E end-of-file".split(' ')
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=f)
        p2.wait()

    # restore the actual repeat sequence in place of the motif name 'unit'
    cmd = f'sed -i s/unit/{repeat}/g {final_report_fn}'
    call(cmd.split(' '))
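
`chunks2` is assumed to split the read ids into at most `params.threads` roughly equal chunks, one per NCRF process. A hypothetical stand-in:

def chunks2(lst, n):
    # split lst into at most n chunks of near-equal size
    if not lst:
        return []
    n = min(n, len(lst))
    q, r = divmod(len(lst), n)
    chunks, start = [], 0
    for i in range(n):
        end = start + q + (1 if i < r else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks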
Example no. 19
    def map_strings(self,
                    string_set,
                    overlap_penalty,
                    neutral_symbs,
                    only_unique_paths=False,
                    outdir=None,
                    n_threads=config['common']['threads'],
                    min_len=None):

        logger.info('Mapping monostrings to graph')
        logger.info('Computing overlaps')
        if min_len is None:
            logger.info('No min len parameter. All strings will be aligned')
        else:
            logger.info(f'Only strings longer than {min_len} will be aligned')
            total_reads = len(string_set)
            string_set = {
                s_id: string
                for s_id, string in string_set.items()
                if len(string) >= min_len
            }
            long_reads = len(string_set)
            logger.info(f'{long_reads} / {total_reads} longer than {min_len}')

        overlaps, excessive_overlaps = \
            find_overlaps(graph=self,
                          string_set=string_set,
                          overlap_penalty=overlap_penalty,
                          neutral_symbs=neutral_symbs,
                          n_threads=n_threads)
        # print(overlaps)

        logger.info('Computing chains')
        chains = get_chains(graph=self, overlaps=overlaps, n_threads=n_threads)

        unmapped = {
            s_id
            for s_id, s_chains in chains.items() if len(s_chains) == 0
        }
        logger.info(f'{len(unmapped)} strings are unmapped')
        logger.info(f'That includes {len(excessive_overlaps)} reads '
                    f'with too many overlaps (see config)')

        unique_mapping = {
            s_id
            for s_id, s_chains in chains.items() if len(s_chains) == 1
        }
        logger.info(f'{len(unique_mapping)} strings are uniquely mapped')

        if outdir is not None:
            smart_makedirs(outdir)

            unmapped_fn = os.path.join(outdir, 'unmapped.txt')
            with open(unmapped_fn, 'w') as f:
                for s_id in unmapped:
                    print(s_id, file=f)

            excessive_fn = os.path.join(outdir, 'excessive.txt')
            with open(excessive_fn, 'w') as f:
                print('s_id', '#overlaps', file=f)
                for s_id, s_overlaps in excessive_overlaps.items():
                    print(s_id, len(s_overlaps), file=f)

            chains_fn = os.path.join(outdir, 'chains.txt')
            with open(chains_fn, 'w') as f:
                for s_id, s_chains in chains.items():
                    for chain in s_chains:
                        print(s_id, chain, file=f)

            unique_fn = os.path.join(outdir, 'unique.txt')
            with open(unique_fn, 'w') as f:
                for s_id in unique_mapping:
                    print(s_id, file=f)

        paths = defaultdict(list)
        for r_id, chains_r_id in chains.items():
            for chain in chains_r_id:
                path = [overlap.edge for overlap in chain.overlap_list]
                e_st = chain.overlap_list[0].e_st
                e_en = chain.overlap_list[-1].e_en
                paths[r_id].append((path, e_st, e_en))
        if only_unique_paths:
            paths = {
                r_id: paths_r_id[0]
                for r_id, paths_r_id in paths.items() if len(paths_r_id) == 1
            }
        return paths
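
A hedged usage sketch of `map_strings`; the names `graph` and `monoreads` and the argument values are illustrative assumptions, not values prescribed by the project:

# graph: a de Bruijn graph object exposing map_strings
# monoreads: {read_id: monostring}
paths = graph.map_strings(string_set=monoreads,
                          overlap_penalty=2,     # illustrative value
                          neutral_symbs={'?'},   # illustrative value
                          only_unique_paths=True,
                          outdir='mappings')
# with only_unique_paths=True each read maps to a single (path, e_st, e_en)
for r_id, (path, e_st, e_en) in paths.items():
    print(r_id, len(path), e_st, e_en)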