def filter_unsync(overlaps_fname: str, min_n_ovlp: int,
                  default_min_ovlp_len: int, limit_min_ovlp_len: int,
                  contained_removal: bool, out_fname: str):
    overlaps = load_pickle(overlaps_fname)
    # Remove short and/or noisy overlaps while keeping overlaps around putative
    # low coverage regions
    filtered_overlaps = adaptive_filter_overlaps(overlaps, min_n_ovlp,
                                                 default_min_ovlp_len,
                                                 limit_min_ovlp_len,
                                                 contained_removal)
    filtered_overlaps = best_overlaps_per_pair(filtered_overlaps, by="diff")
    # NOTE: Removing contained reads saves substantial time in overlap
    #       computation, although accuracy might decrease.
    if contained_removal:
        filtered_overlaps = remove_contained_reads(filtered_overlaps)
    save_pickle(filtered_overlaps, out_fname)
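
For intuition, here is a minimal sketch of the adaptive-threshold idea behind `adaptive_filter_overlaps` (which is project-internal): keep overlaps above a default length, but relax the threshold down to a hard limit for reads that would otherwise retain fewer than `min_n_ovlp` overlaps (putative low-coverage regions). The `Ovlp` type and the exact relaxation rule here are assumptions, not the source implementation.

from collections import defaultdict
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class Ovlp:  # hypothetical stand-in for the project's overlap type
    a_read_id: int
    b_read_id: int
    length: int


def adaptive_filter_sketch(overlaps: List[Ovlp],
                           min_n_ovlp: int,
                           default_min_ovlp_len: int,
                           limit_min_ovlp_len: int) -> List[Ovlp]:
    # Group overlaps by the reads they involve
    by_read = defaultdict(list)
    for o in overlaps:
        by_read[o.a_read_id].append(o)
        by_read[o.b_read_id].append(o)
    keep = set()
    for read_id, ovlps in by_read.items():
        ovlps.sort(key=lambda o: o.length, reverse=True)
        for i, o in enumerate(ovlps):
            # Keep overlaps above the default threshold; for reads that
            # would otherwise retain fewer than `min_n_ovlp` overlaps,
            # relax the threshold down to `limit_min_ovlp_len`.
            if (o.length >= default_min_ovlp_len
                    or (i < min_n_ovlp and o.length >= limit_min_ovlp_len)):
                keep.add(o)
    return [o for o in overlaps if o in keep]
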
 def run(self):
     # Distribute split-merge DPMM unit clustering over the synchronized
     # read sets and save the labeled reads
     labeled_reads = run_distribute(
         func=run_smdc_multi,
         args=load_pickle(self.sync_reads_fname),
         shared_args=dict(th_ward=self.th_ward,
                          alpha=self.alpha,
                          p_error=self.p_error,
                          split_init_how=self.split_init_how,
                          merge_how=self.merge_how),
         scheduler=self.scheduler,
         n_distribute=self.n_distribute,
         n_core=self.n_core,
         max_cpu_hour=self.max_cpu_hour,
         max_mem_gb=self.max_mem_gb,
         tmp_dname=self.tmp_dname,
         job_name="smdc_ovlp",
         out_fname=self.out_fname,
         log_level="debug" if self.verbose else "info")
     save_pickle(labeled_reads, self.out_fname)
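
The `run_distribute` calls in these snippets share one assumed contract: split `args` into chunks, apply `func` to each chunk together with `shared_args` (normally as scheduler jobs), and concatenate the per-job result lists. A serial stand-in under that assumption (the real chunking and scheduling are project-internal):

from typing import Any, Callable, Dict, List, Sequence


def run_distribute_serial(func: Callable[..., List],
                          args: Sequence,
                          shared_args: Dict[str, Any],
                          n_distribute: int = 1) -> List:
    # Split `args` into `n_distribute` contiguous chunks and run them one
    # after another instead of submitting them to a job scheduler.
    chunk_size = max(1, -(-len(args) // n_distribute))  # ceiling division
    results: List = []
    for start in range(0, len(args), chunk_size):
        results += func(args[start:start + chunk_size], **shared_args)
    return results
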
Example #3
def load_qv(reads_fname: str,
            fastq_fname: Optional[str] = None,
            mean_qv: Optional[int] = None):
    def to_ccs_name(read_name: str) -> str:
        # Convert a subread name ("movie/zmw/start_end") to its CCS name
        pre, mid, _ = read_name.split('/')
        return f"{pre}/{mid}/ccs"

    assert not (fastq_fname is None and mean_qv is None), \
        "One of `fastq_fname` or `mean_qv` must be specified."
    reads = load_pickle(reads_fname)
    if fastq_fname is not None:
        reads_fastq_by_name = {
            read.name: read
            for read in load_fastq(fastq_fname)
        }
        for read in reads:
            read.qual = reads_fastq_by_name[to_ccs_name(read.name)].qual_phred
    else:
        for read in reads:
            read.qual = np.full(read.length, mean_qv, dtype=np.int8)
    # Overwrite the input pickle with the QV-annotated reads
    save_pickle(reads, reads_fname)
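
For reference (not part of the source): `qual_phred` and `mean_qv` are Phred-scaled quality values, where a QV of Q corresponds to a per-base error probability of 10^(-Q/10).

import numpy as np


def qv_to_error_prob(qv: np.ndarray) -> np.ndarray:
    # Phred scale: QV = -10 * log10(p_error)
    return np.power(10.0, -qv / 10.0)


# A uniform `mean_qv` of 20 corresponds to a 1% per-base error rate
assert np.isclose(qv_to_error_prob(np.array([20], dtype=np.int8))[0], 0.01)
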
Example #4
 def filter_read_ids(self) -> List[int]:
     # Select representative read IDs to anchor read synchronization
     overlaps = load_pickle(self.overlaps_fname)
     n_ovlps_per_read = Counter()
     for o in overlaps:
         n_ovlps_per_read[o.a_read_id] += 1
         n_ovlps_per_read[o.b_read_id] += 1
     read_ids = set(n_ovlps_per_read.keys())
     # NOTE: "80-80" rule for global mode
     #       i.e. if >80% of reads involved in the overlaps overlap to
     #       >80% of the reads, then run in a global mode.
     n_heavy_reads = sum(c >= 0.8 * len(read_ids)
                         for c in n_ovlps_per_read.values())
     if n_heavy_reads >= 0.8 * len(read_ids):  # global mode
         # Pick the single read that appears most frequently in overlaps
         logger.info("Run in global mode")
         return [n_ovlps_per_read.most_common()[0][0]]
     else:  # local mode
         # Merge reads that have the same set of overlapping reads
         filtered_read_ids = []
         added_read_id_set = set()
         for read_id in sorted(read_ids):
             read_id_set = {read_id}
             for o in overlaps:
                 if o.a_read_id == read_id:
                     read_id_set.add(o.b_read_id)
                 elif o.b_read_id == read_id:
                     read_id_set.add(o.a_read_id)
             read_id_key = tuple(sorted(read_id_set))
             if read_id_key not in added_read_id_set:
                 filtered_read_ids.append(read_id)
                 added_read_id_set.add(read_id_key)
         logger.info("Read IDs for synchronization: "
                     f"{len(read_ids)} -> {len(filtered_read_ids)}")
         return filtered_read_ids
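
A toy check of the 80-80 branch with hypothetical pair data: with five reads that all overlap each other, every read takes part in 4 overlaps, so all 5 reads (>= 0.8 * 5) clear the 0.8 * 5 = 4 threshold and global mode is chosen.

from collections import Counter

pairs = [(a, b) for a in range(5) for b in range(a + 1, 5)]  # all-vs-all
n_ovlps_per_read = Counter()
for a, b in pairs:
    n_ovlps_per_read[a] += 1
    n_ovlps_per_read[b] += 1
n_reads = len(n_ovlps_per_read)            # 5
n_heavy = sum(c >= 0.8 * n_reads           # every read has 4 >= 4 overlaps
              for c in n_ovlps_per_read.values())
assert n_heavy >= 0.8 * n_reads            # -> global mode
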
Example #5
 def run(self):
     # Compute all-vs-all overlaps between synchronized TR reads
     sync_reads = load_pickle(self.sync_reads_fname)
     assert (isinstance(sync_reads, list)
             and isinstance(sync_reads[0], tuple)), \
         "`sync_reads_fname` must contain `List[Tuple[int, int, List[TRRead]]]`"
     assert all([
         read.synchronized for _, _, reads in sync_reads for read in reads
     ]), "Synchronize units first"
     overlaps = run_distribute(
         func=ava_sync,
         args=sync_reads,
         shared_args=dict(max_units_diff=self.max_units_diff,
                          max_seq_diff=self.max_seq_diff),
         scheduler=self.scheduler,
         n_distribute=self.n_distribute,
         n_core=self.n_core,
         max_cpu_hour=self.max_cpu_hour,
         max_mem_gb=self.max_mem_gb,
         tmp_dname=self.tmp_dname,
         job_name="ava_sync",
         out_fname=self.out_fname,
         log_level="debug" if self.verbose else "info")
     # Deduplicate redundant overlaps before saving
     save_pickle(sorted(reduce_same_overlaps(list(set(overlaps)))),
                 self.out_fname)
def main():
    config = parse_args()
    # Expand task-group names into the concrete set of tasks to run
    tasks = set(config["tasks"])
    if "extract" in config["tasks"]:
        tasks |= EXTRACT
    if "filter" in config["tasks"]:
        tasks |= FILTER
    if "assemble" in config["tasks"]:
        tasks |= ASSEMBLE
    if "all" in config["tasks"]:
        tasks |= ALL
    scheduler = (Scheduler(**config["job_scheduler"]["args"])
                 if config["job_scheduler"]["use_scheduler"] else None)
    # Run specified tasks
    if "fasta_to_db" in tasks and config["extract"]["from_fasta"]:
        fasta_to_db(db_prefix=config["extract"]["db_prefix"],
                    db_type=config["extract"]["db_suffix"],
                    **config["extract"]["fasta_to_db"])
    if "datander" in tasks:
        # Detect tandem repeats
        DatanderRunner(db_prefix=config["extract"]["db_prefix"],
                       db_suffix=config["extract"]["db_suffix"],
                       scheduler=scheduler,
                       **config["extract"]["tr_detection"]).run()
    if "datruf" in tasks:
        # Detect tandem repeat units
        DatrufRunner(
            db_fname=(f"{config['extract']['db_prefix']}"
                      f".{config['extract']['db_suffix']}"),
            las_fname=f"TAN.{config['extract']['db_prefix']}.las",
            scheduler=scheduler,
            verbose=config["verbose"],
            **config["extract"]["unit_detection"]).run()
    if "load_qv" in tasks and "load_qv" in config["extract"]:
        # Load QV data
        load_qv(reads_fname=config["extract"]["unit_detection"]["out_fname"],
                **config["extract"]["load_qv"])
    # TODO: Visualize unit length distribution (output an html file?)
    if "filter_reads" in tasks:
        # Filter reads having units you want to assemble
        filter_reads(
            reads_fname=config["extract"]["unit_detection"]["out_fname"],
            **config["filter"])
    if "unsync_overlap" in tasks:
        # Compute overlaps between unsynchronized TR reads
        UnsyncReadsOverlapper(reads_fname=config["assemble"]["reads_fname"],
                              scheduler=scheduler,
                              verbose=config["verbose"],
                              **config["assemble"]["unsync_overlap"]).run()
    if "unsync_filter" in tasks:
        # Adaptively filter unsynchronized overlaps
        filter_unsync(
            overlaps_fname=config["assemble"]["unsync_overlap"]["out_fname"],
            **config["assemble"]["unsync_filter"])
    if "sync" in tasks:
        # Synchronize TR reads
        ReadSynchronizer(
            reads_fname=config["assemble"]["reads_fname"],
            overlaps_fname=config["assemble"]["unsync_filter"]["out_fname"],
            scheduler=scheduler,
            verbose=config["verbose"],
            **config["assemble"]["unit_sync"]).run()
    if "smdc" in tasks:
        # Correct errors in units via clustering
        SplitMergeDpmmOverlapper(
            sync_reads_fname=config["assemble"]["unit_sync"]["out_fname"],
            scheduler=scheduler,
            verbose=config["verbose"],
            **config["assemble"]["unit_clustering"]).run()
    if "sync_overlap" in tasks:
        # Compute overlaps between synchronized TR reads with corrected units
        SyncReadsOverlapper(
            sync_reads_fname=config["assemble"]["unit_clustering"]["out_fname"],
            scheduler=scheduler,
            verbose=config["verbose"],
            **config["assemble"]["sync_overlap"]).run()
    if "sync_filter" in tasks:
        # Adaptively filter synchronized overlaps
        filter_sync(
            overlaps_fname=config["assemble"]["sync_overlap"]["out_fname"],
            **config["assemble"]["sync_filter"])
    if "contig" in tasks:
        # Construct a string graph
        overlaps = load_pickle(config["assemble"]["sync_filter"]["out_fname"])
        sg = overlaps_to_string_graph(overlaps)
        sg_ccs = reduce_graph(sg)
        # Generate contigs
        reads = load_pickle(config["assemble"]["reads_fname"])
        reads_by_id = {read.id: read for read in reads}
        contigs = graphs_to_contigs(
            sg_ccs,
            overlaps,
            reads_by_id,
            n_core=config["assemble"]["layout"]["n_core"])
        save_fasta(contigs, config["assemble"]["layout"]["out_fname"])
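
`main()` assumes a config of roughly the following shape. The keys mirror the accesses above; all values are illustrative placeholders only (a hypothetical example, not a documented format).

config = {
    "verbose": False,
    "tasks": ["all"],
    "job_scheduler": {"use_scheduler": False, "args": {}},
    "extract": {
        "from_fasta": True,
        "db_prefix": "READS",
        "db_suffix": "db",
        "fasta_to_db": {},                  # kwargs for fasta_to_db()
        "tr_detection": {},                 # kwargs for DatanderRunner
        "unit_detection": {"out_fname": "tr_reads.pkl"},
        "load_qv": {"mean_qv": 20},
    },
    "filter": {"out_fname": "filtered_reads.pkl"},
    "assemble": {
        "reads_fname": "filtered_reads.pkl",
        "unsync_overlap": {"out_fname": "unsync_overlaps.pkl"},
        "unsync_filter": {"min_n_ovlp": 10,
                          "default_min_ovlp_len": 10000,
                          "limit_min_ovlp_len": 1000,
                          "contained_removal": True,
                          "out_fname": "unsync_filtered_overlaps.pkl"},
        "unit_sync": {"out_fname": "sync_reads.pkl"},
        "unit_clustering": {"out_fname": "labeled_reads.pkl"},
        "sync_overlap": {"out_fname": "sync_overlaps.pkl"},
        "sync_filter": {"out_fname": "sync_filtered_overlaps.pkl"},
        "layout": {"n_core": 4, "out_fname": "contigs.fasta"},
    },
}
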