def get_contigs_info(contigs_file):
    """
    Reads contig sequences from a fasta file and returns a dict of
    ContigInfo(id, length, type) records keyed by contig id.
    """
    contigs_info = {}
    contigs_fasta = fp.read_fasta_dict(contigs_file)
    # .items() instead of .iteritems() keeps this working on both
    # Python 2 and Python 3
    for ctg_id, ctg_seq in contigs_fasta.items():
        # header convention: the token before the first "_" encodes the
        # contig type (e.g. "contig_1" -> "contig")
        contig_type = ctg_id.split("_")[0]
        contigs_info[ctg_id] = ContigInfo(ctg_id, len(ctg_seq),
                                          contig_type)
    return contigs_info
def get_bubbles(alignment_path, contigs_info, contigs_path,
                err_mode, num_proc, min_alignment):
    """
    The main function: takes an alignment and returns bubbles.

    Spawns num_proc worker processes that consume alignments from a
    shared SAM reader and push per-contig bubbles into a queue, then
    aggregates the results and logs summary statistics.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_fasta_dict(contigs_path),
                                       min_alignment)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    # range works on both Python 2 and Python 3 (xrange is Py2-only);
    # num_proc is small, so materializing the range is free
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(
            target=_thread_worker,
            args=(aln_reader, contigs_info, err_mode,
                  results_queue, error_queue)))
    # workers inherited SIG_IGN; restore the original handler so the
    # main process catches Ctrl-C
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    # re-raise the first exception recorded by any worker
    if not error_queue.empty():
        raise error_queue.get()

    bubbles = []
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    while not results_queue.empty():
        (ctg_bubbles, num_long_bubbles, num_empty,
         num_long_branch, aln_errors) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        bubbles.extend(ctg_bubbles)

    # +1 in the denominator avoids ZeroDivisionError when no alignments
    # were collected
    mean_aln_error = float(sum(total_aln_errors)) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: {0}".format(mean_aln_error))
    logger.debug("Generated {0} bubbles".format(len(bubbles)))
    logger.debug("Split {0} long bubbles".format(total_long_bubbles))
    logger.debug("Skipped {0} empty bubbles".format(total_empty))
    logger.debug("Skipped {0} bubbles with long branches"
                 .format(total_long_branches))

    return bubbles
def run(self):
    """
    Runs one polishing iteration: builds a blasr reference from the
    input assembly, aligns the reads against it, then records the
    stage in the shared job description.
    """
    logger.info("Polishing genome ({0}/{1})".format(self.stage_id,
                                                    self.args.num_iters))

    ref_seqs = fp.read_fasta_dict(self.in_reference)
    blasr_ref = os.path.join(self.work_dir,
                             "blasr_ref_{0}.fasta".format(self.stage_id))
    aln.make_blasr_reference(ref_seqs, blasr_ref)
    aln.make_alignment(blasr_ref, self.args.reads,
                       self.args.threads, self.out_alignment)
    # the reference file is a per-iteration temporary — drop it once
    # the alignment is on disk
    os.remove(blasr_ref)

    Job.run_description["stage_name"] = self.name
    Job.run_description["stage_id"] = self.stage_id
def patch_genome(alignment, reference_file, out_patched):
    """
    Applies patches derived from the given alignments to the reference
    sequences and writes the fixed fasta to out_patched.
    """
    aln_by_ctg = defaultdict(list)
    for aln in alignment:
        aln_by_ctg[aln.trg_id].append(aln)

    ref_fasta = fp.read_fasta_dict(reference_file)
    fixed_fasta = {}
    # .items() instead of .iteritems(): works on both Python 2 and 3
    for ctg_id, ctg_aln in aln_by_ctg.items():
        # NOTE(review): helper name is misspelled ("alignmemnts") — kept
        # as-is to match its definition elsewhere in the project
        patches = _get_patching_alignmemnts(ctg_aln)
        fixed_sequence = _apply_patches(patches, ref_fasta[ctg_id])
        fixed_fasta[ctg_id] = fixed_sequence

    fp.write_fasta_dict(fixed_fasta, out_patched)
def get_consensus(alignment_path, contigs_path, contigs_info,
                  min_aln_length, platform, num_proc):
    """
    Main function: computes a consensus sequence for each contig by
    distributing alignment processing across num_proc worker processes.
    Returns a dict mapping contig id to its (non-empty) consensus.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_fasta_dict(contigs_path),
                                       min_aln_length)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    # range works on both Python 2 and Python 3 (xrange is Py2-only)
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info,
                                          platform, results_queue,
                                          error_queue)))
    # workers inherited SIG_IGN; restore the handler for the main process
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    # re-raise the first exception recorded by any worker
    if not error_queue.empty():
        raise error_queue.get()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        # contigs with an empty consensus are dropped from the output
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    # +1 in the denominator avoids ZeroDivisionError when no alignments
    # were collected
    mean_aln_error = float(sum(total_aln_errors)) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: {0}".format(mean_aln_error))

    return out_fasta
def concatenate_contigs(contigs_file):
    """
    Concatenates contig parts output by assembly module.

    Groups fasta records by contig id and joins their sequences in
    part order. Returns a dict mapping contig id to the concatenated
    sequence.
    """
    genome_fragments = fp.read_fasta_dict(contigs_file)
    by_contig = defaultdict(list)
    # .items() instead of .iteritems(): works on both Python 2 and 3
    for header, seq in genome_fragments.items():
        # assumed header format: <type>_<num>_..._<part>, where
        # tokens[0:2] identify the contig and tokens[3] is the part
        # index — TODO confirm against the assembly module's output
        tokens = header.split("_")
        contig_id = tokens[0] + "_" + tokens[1]
        part_id = int(tokens[3])
        by_contig[contig_id].append((part_id, seq))

    # removed dead code from the original: a contig_types dict and a
    # contig_len variable were computed but never used
    contigs_fasta = {}
    for contig_id, contig_seqs in by_contig.items():
        seqs_sorted = sorted(contig_seqs, key=lambda p: p[0])
        contigs_fasta[contig_id] = "".join([p[1] for p in seqs_sorted])

    return contigs_fasta