def _run_polisher_only(args):
    """
    Runs standalone polisher
    """
    logger.info("Running Flye polisher")
    logger.debug("Cmd: %s", " ".join(sys.argv))

    bam_input = False
    for read_file in args.reads:
        if not os.path.exists(read_file):
            raise ResumeException("Can't open " + read_file)
        # strip an optional .gz suffix explicitly (str.rstrip(".gz") would
        # remove any trailing '.', 'g' or 'z' characters, not the extension)
        without_gz = read_file[:-3] if read_file.endswith(".gz") else read_file
        if not any([without_gz.endswith(x) for x in
                    ["fasta", "fa", "fastq", "fq", "bam"]]):
            raise ResumeException("Unsupported input. Supported types: "
                                  "fasta/fastq/bam")
        if without_gz.endswith("bam"):
            bam_input = True

    if bam_input and len(args.reads) > 1:
        raise ResumeException("Only single bam input supported")

    pol.polish(args.polish_target, args.reads, args.out_dir,
               args.num_iters, args.threads, args.platform,
               args.read_type, output_progress=True)
    logger.info("Done!")

def _run_polisher_only(args):
    """
    Runs standalone polisher
    """
    logger.info("Running Flye polisher")
    logger.debug("Cmd: {0}".format(" ".join(sys.argv)))

    pol.polish(args.polish_target, args.reads, args.out_dir,
               args.num_iters, args.threads, args.platform,
               output_progress=True)

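# A minimal sketch of how the standalone polisher entry point above could be
# invoked programmatically, assuming the surrounding Flye package context.
# The attribute names mirror the argparse options _run_polisher_only reads;
# the concrete paths and values here are purely illustrative.
if __name__ == "__main__":
    from argparse import Namespace

    example_args = Namespace(
        polish_target="draft_assembly.fasta",  # hypothetical draft to polish
        reads=["reads.fastq.gz"],              # hypothetical input reads
        out_dir="polish_out",
        num_iters=2,
        threads=8,
        platform="nano",                       # assumed platform tag
        read_type="raw",                       # only used by the newer variant
    )
    _run_polisher_only(example_args)
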
def run(self):
    super(JobPolishing, self).run()

    if not os.path.isdir(self.polishing_dir):
        os.mkdir(self.polishing_dir)
    pol.polish(self.in_contigs, self.args.reads, self.polishing_dir,
               self.args.num_iters, self.args.threads, self.args.platform,
               output_progress=True)
    polished_file = os.path.join(self.polishing_dir,
                                 "polished_{0}.fasta".format(self.args.num_iters))
    pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                polished_file, self.polishing_dir,
                                self.args.platform, self.args.threads)

def run(self):
    super(JobPolishing, self).run()

    if not os.path.isdir(self.polishing_dir):
        os.mkdir(self.polishing_dir)
    contigs, stats = \
        pol.polish(self.in_contigs, self.args.reads, self.polishing_dir,
                   self.args.num_iters, self.args.threads,
                   self.args.platform, output_progress=True)
    #contigs = os.path.join(self.polishing_dir, "polished_1.fasta")
    #stats = os.path.join(self.polishing_dir, "contigs_stats.txt")
    pol.filter_by_coverage(self.args, stats, contigs,
                           self.out_files["stats"], self.out_files["contigs"])
    pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                self.out_files["contigs"], self.polishing_dir,
                                self.args.platform, self.args.threads)
    os.remove(contigs)

def run(self):
    super(JobPolishing, self).run()

    if not os.path.isdir(self.polishing_dir):
        os.mkdir(self.polishing_dir)
    contigs, stats = \
        pol.polish(self.in_contigs, self.args.reads, self.polishing_dir,
                   self.args.num_iters, self.args.threads,
                   self.args.platform, self.args.read_type,
                   output_progress=True)
    #contigs = os.path.join(self.polishing_dir, "polished_1.fasta")
    #stats = os.path.join(self.polishing_dir, "contigs_stats.txt")
    pol.filter_by_coverage(self.args, stats, contigs,
                           self.out_files["stats"], self.out_files["contigs"])
    pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                self.out_files["contigs"], self.polishing_dir,
                                self.args.platform, stats, self.args.threads)
    os.remove(contigs)

    if os.path.getsize(self.out_files["contigs"]) == 0:
        raise asm.AssembleException("No contigs were assembled - "
                                    "pipeline stopped")

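# The stats file that pol.filter_by_coverage consumes above is a plain
# whitespace-separated table (the same format parsed inline in
# assemble_short_plasmids below): a header line, then seq_id in the first
# column and coverage in the third. A hypothetical helper that loads it into
# a dict might look like this; parse_contig_stats is not part of Flye.
def parse_contig_stats(stats_path):
    coverage_by_contig = {}
    with open(stats_path, "r") as f:
        for line in f:
            # skip the header and any commented lines
            if line.startswith("seq") or line.startswith("#"):
                continue
            tokens = line.strip().split()
            if len(tokens) < 3:
                continue
            coverage_by_contig[tokens[0]] = int(tokens[2])
    return coverage_by_contig
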
def assemble_short_plasmids(args, work_dir, contigs_path):
    logger.debug("Extracting unmapped reads")
    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path, args.reads, args.threads,
                   work_dir, args.platform, reads2contigs_mapping,
                   reference_mode=True, sam_output=False)

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                    unmapped_reads_path,
                                    mapping_rate_threshold=0.5)

    logger.debug("Finding self-mappings for unmapped reads")
    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads,
                   work_dir, args.platform, unmapped_reads_mapping,
                   reference_mode=False, sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %d circular reads", len(circular_reads))

    logger.debug("Extracting circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %d circular pairs", len(circular_pairs))

    #extracting only the necessary subset of reads (the entire file could be pretty big)
    interesting_reads = {}
    for read in circular_reads:
        interesting_reads[read] = None
    for pair in circular_pairs:
        interesting_reads[pair[0].query] = None
        interesting_reads[pair[0].target] = None
    for hdr, seq in fp.stream_sequence(unmapped_reads_path):
        if hdr in interesting_reads:
            interesting_reads[hdr] = seq

    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, interesting_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, interesting_reads)

    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    logger.debug("Clustering circular sequences")
    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    make_alignment(trimmed_sequences_path, [trimmed_sequences_path],
                   args.threads, work_dir, args.platform,
                   trimmed_sequences_mapping, reference_mode=False,
                   sam_output=False)

    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)
    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    _, polished_stats = \
        pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1,
                   args.threads, args.platform, output_progress=False)

    #extract coverage
    plasmids_with_coverage = {}
    if os.path.isfile(polished_stats):
        with open(polished_stats, "r") as f:
            for line in f:
                if line.startswith("#"):
                    continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added %d extra contigs", len(plasmids_with_coverage))

    # remove all unnecessary files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage

def assemble_short_plasmids(args, work_dir, contigs_path):
    logger.debug("Assembling short plasmids")

    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path, args.reads, args.threads,
                   work_dir, args.platform, reads2contigs_mapping,
                   reference_mode=True, sam_output=False)

    logger.debug("Extracting unmapped reads")
    unmapped_reads, n_processed_reads = \
        unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                        mapping_rate_threshold=0.5)

    n_unmapped_reads = len(unmapped_reads)
    unmapped_reads_ratio = 100 * float(len(unmapped_reads)) / n_processed_reads
    unmapped_reads_ratio = round(unmapped_reads_ratio, 1)
    logger.debug("Extracted {} unmapped reads ({} %)".format(
        n_unmapped_reads, unmapped_reads_ratio))

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    fp.write_fasta_dict(unmapped_reads, unmapped_reads_path)

    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    logger.debug("Finding self-mappings for unmapped reads")
    make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads,
                   work_dir, args.platform, unmapped_reads_mapping,
                   reference_mode=False, sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted {} circular reads".format(len(circular_reads)))

    logger.debug("Extracting circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted {} circular pairs".format(len(circular_pairs)))

    logger.debug("Extracting unique plasmids from circular sequences")
    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, unmapped_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, unmapped_reads)

    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    # list() makes the dict_items concatenation work under Python 3 as well
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    make_alignment(trimmed_sequences_path, [trimmed_sequences_path],
                   args.threads, work_dir, args.platform,
                   trimmed_sequences_mapping, reference_mode=False,
                   sam_output=False)

    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)
    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1,
               args.threads, args.platform, output_progress=False)

    #extract coverage
    plasmids_with_coverage = {}
    if os.path.isfile(os.path.join(work_dir, "contigs_stats.txt")):
        with open(os.path.join(work_dir, "contigs_stats.txt"), "r") as f:
            for line in f:
                if line.startswith("seq"):
                    continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added {} extra contigs".format(len(plasmids_with_coverage)))

    # remove all unnecessary files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage

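# Illustrative call site for either assemble_short_plasmids variant above.
# The args object only needs the attributes the function actually reads
# (reads, threads, platform); the paths and values here are hypothetical,
# and _plasmid_example itself is not part of Flye.
def _plasmid_example():
    from argparse import Namespace

    args = Namespace(reads=["reads.fastq.gz"], threads=8, platform="nano")
    plasmids = assemble_short_plasmids(args, work_dir="plasmids_work",
                                       contigs_path="draft_assembly.fasta")
    # each value is a (sequence, coverage) pair keyed by contig id
    for seq_id, (sequence, coverage) in plasmids.items():
        logger.info("%s: length=%d, coverage=%d",
                    seq_id, len(sequence), coverage)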