def _compose_sequence(consensus_file):
    """Assemble bubble consensus pieces into per-contig sequences.

    The input file alternates header lines (">ctg pos cov sub_pos") with
    sequence lines. Returns a pair (fasta_dict, stats_dict): contig id ->
    concatenated sequence, and contig id -> sequence length.
    """
    pieces = defaultdict(list)
    coverage = defaultdict(list)
    with open(consensus_file, "r") as handle:
        for line_no, raw in enumerate(handle):
            if line_no % 2 == 0:
                #even lines are headers describing the following sequence
                fields = raw.strip().split(" ")
                if len(fields) != 4:
                    raise Exception("Bubble format error")
                ctg_id = fields[0][1:]
                ctg_pos = int(fields[1])
                #coverage[ctg_id].append(int(fields[2]))
                ctg_sub_pos = int(fields[3])
            else:
                #odd lines carry the consensus sequence for the header above
                pieces[ctg_id].append((ctg_pos, ctg_sub_pos, raw.strip()))

    polished_fasta = {}
    polished_stats = {}
    for ctg_id, chunks in iteritems(pieces):
        #order bubbles by (position, sub-position) before concatenation
        chunks.sort(key=lambda item: (item[0], item[1]))
        sequence = "".join(item[2] for item in chunks)
        #mean_coverage = sum(coverage[ctg_id]) / len(coverage[ctg_id])
        polished_fasta[ctg_id] = sequence
        polished_stats[ctg_id] = len(sequence)
    return polished_fasta, polished_stats
def extract_unmapped_reads(args, reads2contigs_mapping, unmapped_reads_path,
                           mapping_rate_threshold):
    """Write reads that are not well-mapped to the contigs into a fasta file.

    A read counts as unmapped when none of its per-contig mapping rates
    reaches mapping_rate_threshold (or when it has no mapping at all).

    Args:
        args: run arguments; args.reads lists the input read files.
        reads2contigs_mapping: alignment file passed to calc_mapping_rates.
        unmapped_reads_path: output fasta path for the unmapped reads.
        mapping_rate_threshold: minimum rate for a read to count as mapped.
    """
    mapping_rates = calc_mapping_rates(reads2contigs_mapping)

    total_bases = 0
    unmapped_bases = 0
    with open(unmapped_reads_path, "w") as fout:
        for reads_file in args.reads:
            for hdr, sequence in fp.stream_sequence(reads_file):
                total_bases += len(sequence)
                contigs = mapping_rates.get(hdr)
                #unmapped if no hit at all, or every hit is below threshold
                is_unmapped = (contigs is None or
                               all(rate < mapping_rate_threshold
                                   for _, rate in iteritems(contigs)))
                if is_unmapped:
                    unmapped_bases += len(sequence)
                    fout.write(">{0}\n{1}\n".format(hdr, sequence))

    #guard against empty input: avoid ZeroDivisionError in the ratio
    unmapped_fraction = unmapped_bases / total_bases if total_bases else 0.0
    logger.debug("Unmapped sequence: %d / %d (%f)",
                 unmapped_bases, total_bases, unmapped_fraction)
def get_contigs_info(contigs_file):
    """Build a ContigInfo record for every sequence in a contigs fasta."""
    infos = {}
    for ctg_id, ctg_seq in iteritems(fp.read_sequence_dict(contigs_file)):
        #contig type is encoded as the first "_"-separated token of the id
        infos[ctg_id] = ContigInfo(ctg_id, len(ctg_seq), ctg_id.split("_")[0])
    return infos
def trim_circular_reads(circular_reads, unmapped_reads):
    """Cut each circular read at its self-hit start and give it a fresh name.

    Keys of the returned dict are "circular_read_<i>"; values are the
    upper-cased prefixes of the corresponding unmapped reads.
    """
    return {
        "circular_read_" + str(idx):
            unmapped_reads[name][:hit.target_start].upper()
        for idx, (name, hit) in enumerate(iteritems(circular_reads))
    }
def split_into_chunks(fasta_in, chunk_size):
    """Split every input sequence into chunks of roughly chunk_size bases.

    The trailing remainder is folded into the final chunk, so no emitted
    chunk is shorter than chunk_size (a sequence shorter than chunk_size
    yields a single chunk). Chunk ids follow "<header>$chunk_<i>".
    """
    chunks = {}
    for header, seq in iteritems(fasta_in):
        num_chunks = max(len(seq) // chunk_size, 1)
        for idx in range(num_chunks):
            left = idx * chunk_size
            right = left + chunk_size
            #last chunk absorbs whatever is left of the sequence
            if len(seq) - right < chunk_size:
                right = len(seq)
            chunks["{0}$chunk_{1}".format(header, idx)] = seq[left:right]
    return chunks
def __init__(self, sam_alignment, reference_fasta, multiproc_manager,
             max_coverage=None, use_secondary=False):
    """Validate the bam alignment and set up shared reference storage.

    Raises AlignmentException when the bam file or its .bai index
    is missing.
    """
    #fail early if the alignment or its index cannot be found
    if not os.path.exists(sam_alignment):
        raise AlignmentException("Can't open {0}".format(sam_alignment))
    if not os.path.exists(sam_alignment + ".bai"):
        raise AlignmentException("Bam not indexed: {0}".format(sam_alignment))

    #immutable after construction; every worker process gets its own copy
    self.aln_path = sam_alignment
    self.max_coverage = max_coverage
    self.use_secondary = use_secondary
    self.cigar_parser = re.compile(b"[0-9]+[MIDNSHP=X]")

    #self.shared_manager = multiprocessing.Manager()
    #reference sequences go into a manager-backed dict shared across
    #processes, keyed/valued as bytes
    self.ref_fasta = multiproc_manager.dict()
    for name, seq in iteritems(reference_fasta):
        self.ref_fasta[_BYTES(name)] = _BYTES(seq)
def __init__(self, sam_alignment, reference_fasta,
             max_coverage=None, use_secondary=False):
    """Initialize reader state and start the background IO worker process.

    Parameters:
        sam_alignment: path to an existing sam/bam alignment file
        reference_fasta: dict of reference name -> sequence
        max_coverage: optional coverage cap (stored; used by other methods)
        use_secondary: whether secondary alignments are used (stored)

    Raises:
        AlignmentException: if sam_alignment does not exist.
    """
    #check that alignment exists
    if not os.path.exists(sam_alignment):
        raise AlignmentException("Can't open {0}".format(sam_alignment))

    #will not be changed during execution, each process has its own copy
    self.aln_path = sam_alignment
    #reference stored as bytes for direct comparison with binary sam fields
    self.ref_fasta = {_BYTES(h): _BYTES(s)
                      for (h, s) in iteritems(reference_fasta)}
    self.change_strand = True
    self.max_coverage = max_coverage
    self.use_secondary = use_secondary
    self.cigar_parser = re.compile(b"[0-9]+[MIDNSHP=X]")

    #will be shared between processes
    self.shared_manager = multiprocessing.Manager()
    self.shared_reader_queue = self.shared_manager.Queue()
    self.shared_num_jobs = multiprocessing.Value(ctypes.c_int, 0)
    self.shared_lock = self.shared_manager.Lock()
    self.shared_eof = multiprocessing.Value(ctypes.c_bool, False)

    #specific to IO thread (actually a separate process, see below)
    self.io_thread = None
    self.terminate_flag = False
    self.processed_contigs = set()
    self.chunk_buffer = []
    self.current_contig = None

    #start IO worker; note it is a Process despite the "thread" naming,
    #and it receives this (partially shared) object as its argument
    self.io_thread = \
        multiprocessing.Process(target=SynchronizedSamReader._io_thread_worker,
                                args=(self,))
    self.io_thread.start()
def __init__(self, sam_alignment, reference_fasta,
             max_coverage=None, use_secondary=False):
    """Read sequence lengths from the sam header and set up shared cursors.

    Parameters:
        sam_alignment: path to an existing sam file (read as binary)
        reference_fasta: dict of reference name -> sequence
        max_coverage: optional coverage cap (stored; used by other methods)
        use_secondary: whether secondary alignments are used (stored)

    Raises:
        AlignmentException: if sam_alignment does not exist.
    """
    #will not be changed during execution, each process has its own copy
    self.aln_path = sam_alignment
    self.aln_file = None
    #reference stored as bytes to match the binary sam parsing below
    self.ref_fasta = {_BYTES(h) : _BYTES(s)
                      for (h, s) in iteritems(reference_fasta)}
    self.change_strand = True
    self.max_coverage = max_coverage
    self.seq_lengths = {}   #reference name (str) -> length from @SQ header
    self.use_secondary = use_secondary
    self.cigar_parser = None
    self.processed_contigs = None

    #reading SAM header: collect sequence names/lengths from @SQ lines
    if not os.path.exists(self.aln_path):
        raise AlignmentException("Can't open {0}".format(self.aln_path))
    with open(self.aln_path, "rb") as f:
        for line in f:
            #header lines come first; stop at the first alignment record
            if not line or not _is_sam_header(line):
                break
            if line.startswith(b"@SQ"):
                seq_name = None
                seq_len = None
                for tag in line.split():
                    #tags look like b"SN:name" / b"LN:123"; [3:] skips "XX:"
                    if tag.startswith(b"SN"):
                        seq_name = tag[3:]
                    if tag.startswith(b"LN"):
                        seq_len = int(tag[3:])
                if seq_name and seq_len:
                    self.seq_lengths[_STR(seq_name)] = seq_len

    #will be shared between processes
    self.lock = multiprocessing.Lock()
    self.eof = multiprocessing.Value(ctypes.c_bool, False)
    self.position = multiprocessing.Value(ctypes.c_longlong, 0)
def dump_repeats(repeats_info, filename):
    """Write a human-readable dump of repeat read assignments to a file."""
    with open(filename, "w") as out:
        for repeat_id, info in iteritems(repeats_info):
            out.write("#Repeat {0}\t{1}\n\n".format(repeat_id,
                                                    info.multiplicity))

            #reads covering the repeat itself
            out.write("#All reads\t{0}\n".format(len(info.all_reads)))
            for read_id in info.all_reads:
                out.write(read_id + "\n")
            out.write("\n")

            #reads entering through each input edge
            for in_edge in info.in_reads:
                out.write("#Input {0}\t{1}\n"
                          .format(in_edge, len(info.in_reads[in_edge])))
                for read_id in info.in_reads[in_edge]:
                    out.write(read_id + "\n")
                out.write("\n")

            #reads leaving through each output edge
            for out_edge in info.out_reads:
                out.write("#Output {0}\t{1}\n"
                          .format(out_edge, len(info.out_reads[out_edge])))
                for read_id in info.out_reads[out_edge]:
                    out.write(read_id + "\n")
                out.write("\n")
def generate_stats(repeat_file, polished_file, scaffolds, out_stats):
    """Compile per-scaffold statistics from multiple pipeline stages.

    Merges the repeat-graph stats table with (optional) polishing stats,
    aggregates them per scaffold, writes a tab-separated table to
    out_stats and logs overall assembly statistics.

    Args:
        repeat_file: tab-separated per-contig stats from the repeat stage.
        polished_file: optional table with updated lengths/coverages,
            or None if polishing was skipped.
        scaffolds: dict scaffold id -> list of signed contig ids
            ("+ctg"/"-ctg").
        out_stats: output path for the scaffold stats table.
    """
    contigs_stats = {}
    header_line = "#seq_name\tlength\tcov.\tcirc.\trepeat\tmult.\tgraph_path"

    for line in open(repeat_file, "r"):
        if line.startswith("#"):
            continue
        tokens = line.strip().split("\t")
        contigs_stats[tokens[0]] = SeqStats(*tokens)

    #polishing may have changed contig lengths and coverages
    if polished_file is not None:
        for line in open(polished_file, "r"):
            if line.startswith("#"):
                continue
            tokens = line.strip().split("\t")
            contigs_stats[tokens[0]].length = tokens[1]
            contigs_stats[tokens[0]].coverage = tokens[2]

    scaffolds_stats = {}
    for scf, scf_seq in iteritems(scaffolds):
        scaffolds_stats[scf] = SeqStats(scf)
        #scaffold length = sum of contigs plus fixed-size gaps between them
        scf_length = sum(int(contigs_stats[unsigned(c)].length)
                         for c in scf_seq)
        scf_length += (len(scf_seq) - 1) * cfg.vals["scaffold_gap"]
        scaffolds_stats[scf].length = str(scf_length)
        scf_cov = _mean([int(contigs_stats[unsigned(c)].coverage)
                         for c in scf_seq])
        scaffolds_stats[scf].coverage = str(scf_cov)
        scaffolds_stats[scf].repeat = contigs_stats[unsigned(scf_seq[0])].repeat
        scaffolds_stats[scf].circular = \
            contigs_stats[unsigned(scf_seq[0])].circular
        scf_mult = min(int(contigs_stats[unsigned(c)].mult) for c in scf_seq)
        scaffolds_stats[scf].mult = str(scf_mult)

        #telomere information: the first/last contig's telomere side,
        #reoriented by the contig's sign within the scaffold
        telomere_left = contigs_stats[unsigned(scf_seq[0])].telomere
        telomere_right = contigs_stats[unsigned(scf_seq[-1])].telomere
        if scf_seq[0][0] == "+":
            scf_left = telomere_left in ["left", "both"]
        else:
            scf_left = telomere_left in ["right", "both"]
        if scf_seq[-1][0] == "+":
            scf_right = telomere_right in ["right", "both"]
        else:
            scf_right = telomere_right in ["left", "both"]
        #fix: classify using the computed flags instead of an
        #unconditional assignment that ignored them
        if scf_left and scf_right:
            scaffolds_stats[scf].telomere = "both"
        elif scf_left and not scf_right:
            scaffolds_stats[scf].telomere = "left"
        elif not scf_left and scf_right:
            scaffolds_stats[scf].telomere = "right"
        else:
            scaffolds_stats[scf].telomere = "none"

        #graph path: reverse and negate a contig's path when it is
        #used in "-" orientation; "??" marks scaffold gaps
        path = []
        for ctg in scf_seq:
            ctg_path = contigs_stats[unsigned(ctg)].graph_path
            if ctg[0] == "-":
                ctg_path = ",".join(
                    [str(-int(x)) for x in ctg_path.split(",")][::-1])
            path.append(ctg_path)
        prefix = "*," if scf_left else ""
        suffix = ",*" if scf_right else ""
        scaffolds_stats[scf].graph_path = prefix + ",??,".join(path) + suffix

    with open(out_stats, "w") as f:
        f.write(header_line + "\n")
        #scaffold ids end in "_<number>"; sort numerically
        for scf in sorted(scaffolds_stats,
                          key=lambda x: int(x.rsplit("_", 1)[-1])):
            scaffolds_stats[scf].print_out(f)

    total_length = sum(int(x.length) for x in scaffolds_stats.values())
    if total_length == 0:
        return

    num_scaffolds = len(scaffolds_stats)
    num_contigs = sum(len(x) for x in scaffolds.values())
    scaffold_lengths = [int(s.length) for s in scaffolds_stats.values()]
    largest_scf = max(scaffold_lengths)
    scf_n50 = _calc_n50(scaffold_lengths, total_length)

    #length-weighted mean coverage over all scaffolds
    mean_read_cov = 0
    for scf in scaffolds_stats.values():
        mean_read_cov += int(scf.length) * int(scf.coverage)
    mean_read_cov //= total_length

    logger.info(
        "Assembly statistics:\n\n"
        "\tTotal length:\t%d\n"
        "\tFragments:\t%d\n"
        "\tFragments N50:\t%d\n"
        "\tLargest frg:\t%d\n"
        "\tScaffolds:\t%d\n"
        "\tMean coverage:\t%d\n",
        total_length, num_scaffolds, scf_n50, largest_scf,
        num_contigs - num_scaffolds, mean_read_cov)
def setup_params(args):
    """Compute run parameters (min overlap, read length cutoff) from reads.

    Returns a dict with keys: pipeline_version, min_overlap,
    min_read_length.

    Raises:
        ConfigException: if a read exceeds the maximum representable
            length, or no read passes the minimum length threshold.
    """
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    total_length = 0
    read_lengths = []
    MAX_READ_LEN = 2**31 - 1    #reads longer than this overflow 32-bit indices

    #reads shorter than the (effective) minimum overlap can never be used
    lowest_read_len = cfg.vals["min_overlap_range"][args.read_type][0]
    if args.min_overlap:
        lowest_read_len = args.min_overlap

    passing_reads = 0
    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            if seq_len > MAX_READ_LEN:
                raise Exception("Length of single read in '{}' exceeded maximum ({})"
                                .format(read_file, MAX_READ_LEN)) \
                    if False else \
                    ConfigException("Length of single read in '{}' exceeded maximum ({})"
                                    .format(read_file, MAX_READ_LEN))
            if seq_len > lowest_read_len:
                passing_reads += 1
                total_length += seq_len
                read_lengths.append(seq_len)
    if not passing_reads:
        raise ConfigException("No reads above minimum length threshold ({})"
                              .format(lowest_read_len))

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)
    #coverage is only known when a genome size estimate was provided
    coverage = None
    if args.genome_size:
        coverage = total_length // args.genome_size
        logger.info("Input genome size: %d", args.genome_size)
        logger.info("Estimated coverage: %d", coverage)
        if coverage < 5 or coverage > 1000:
            logger.warning("Expected read coverage is " + str(coverage) +
                           ", the assembly is not " +
                           "guaranteed to be optimal in this setting."
                           + " Are you sure that the genome size " +
                           "was entered correctly?")
    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)

    if args.min_overlap is None:
        #round reads N90 to the nearest 1000, clamped to the configured range
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE
        MIN_OVLP = cfg.vals["min_overlap_range"][args.read_type][0]
        MAX_OVLP = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            MAX_OVLP = min(MAX_OVLP, cfg.vals["max_meta_overlap"])
        parameters["min_overlap"] = max(MIN_OVLP, min(MAX_OVLP, int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Downsampling reads for the first assembly stage to save memory.
    #fix: `coverage` was previously undefined here when no genome size
    #was given, causing a NameError with --asm-coverage; it now requires
    #a known coverage estimate
    target_cov = None
    if (args.asm_coverage and coverage is not None
            and args.asm_coverage < coverage):
        target_cov = args.asm_coverage

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
def setup_params(args):
    """Select assembly parameters (min overlap, k-mer size, read cutoff).

    Legacy variant that requires args.genome_size to be set (it is used
    unconditionally for coverage estimation and k-mer selection).
    Returns a dict with keys: pipeline_version, min_overlap, kmer_size,
    min_read_length.
    """
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    #collect the lengths of all input reads
    total_length = 0
    read_lengths = []
    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            total_length += seq_len
            read_lengths.append(seq_len)
    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)
    coverage = total_length // args.genome_size
    logger.info("Input genome size: %d", args.genome_size)
    logger.info("Estimated coverage: %d", coverage)
    if coverage < 5 or coverage > 1000:
        logger.warning("Expected read coverage is " + str(coverage) +
                       ", the assembly is not " +
                       "guaranteed to be optimal in this setting."
                       + " Are you sure that the genome size " +
                       "was entered correctly?")
    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)

    if args.min_overlap is None:
        #round reads N90 to the nearest 1000 and clamp to the configured
        #per-read-type range
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE
        parameters["min_overlap"] = \
            max(cfg.vals["min_overlap_range"][args.read_type][0],
                min(cfg.vals["min_overlap_range"][args.read_type][1],
                    int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Selecting k-mer size: bigger genomes get the larger configured k-mer
    if args.genome_size < cfg.vals["big_genome_kmer"]:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][0]
    else:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][1]
    logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage
    #if not args.asm_coverage and args.genome_size >= 10 ** 9:
    #    target_cov = cfg.vals["reduced_asm_cov"]

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters