def short_statistics(fasta_file):
    """Summarise the sequence lengths stored in ``fasta_file``.

    Lengths are obtained from ``fp.read_sequence_lengths``.

    Returns:
        A ``(total_length, n50)`` tuple, where ``n50`` is whatever
        ``_calc_n50`` reports for the collected lengths. ``(0, 0)`` is
        returned when the file yields no sequences.
    """
    seq_lengths = list(fp.read_sequence_lengths(fasta_file).values())
    if seq_lengths:
        total_bases = sum(seq_lengths)
        return total_bases, _calc_n50(seq_lengths, total_bases)
    return 0, 0
def setup_params(args):
    """Derive run parameters from the input reads and command-line options.

    Collects the length of every read in ``args.reads``, logs the estimated
    coverage and the reads N50/N90, and fills in the parameters dictionary.

    Args:
        args: parsed options. This function reads ``reads``, ``genome_size``,
            ``read_type``, ``min_overlap`` and ``asm_coverage``.

    Returns:
        dict with the keys ``"min_overlap"``, ``"kmer_size"`` and
        ``"min_read_length"``.
    """
    logger.info("Configuring run")
    parameters = {}

    # Only the lengths are needed, so iterate over the values. Unlike
    # ``.iteritems()`` this works on both Python 2 and Python 3.
    read_lengths = []
    for read_file in args.reads:
        read_lengths.extend(fp.read_sequence_lengths(read_file).values())
    total_length = sum(read_lengths)

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.debug("Total sequence length: %s", total_length)
    # Floor division keeps the coverage an integer on every Python version
    # (plain ``/`` yields a float on Python 3).
    coverage = total_length // args.genome_size
    logger.info("Input genome size: %s", args.genome_size)
    logger.info("Estimated coverage: %s", coverage)
    if coverage < 5 or coverage > 1000:
        logger.warning("Expected read coverage is " + str(coverage) +
                       ", the assembly is not " +
                       "guaranteed to be optimal in this setting." +
                       " Are you sure that the genome size " +
                       "was entered correctly?")

    logger.info("Reads N50/N90: %s / %s", reads_n50, reads_n90)
    if args.min_overlap is None:
        # Round the reads N90 to the nearest multiple of GRADE, then clamp
        # it to the configured range for this read type.
        GRADE = 1000
        int_min_ovlp = int(round(float(reads_n90) / GRADE)) * GRADE
        parameters["min_overlap"] = \
            max(cfg.vals["min_overlap_range"][args.read_type][0],
                min(cfg.vals["min_overlap_range"][args.read_type][1],
                    int_min_ovlp))
        logger.info("Minimum overlap set to %s", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap

    #Selecting k-mer size
    if args.genome_size < cfg.vals["big_genome_kmer"]:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][0]
    else:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][1]
    logger.info("Selected k-mer size: %s", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage
    #if not args.asm_coverage and args.genome_size >= 10 ** 9:
    #    target_cov = cfg.vals["reduced_asm_cov"]

    if target_cov:
        logger.info("Using longest %sx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %s", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
def setup_params(args):
    """Derive run parameters from the input reads and command-line options.

    Collects the length of every read in ``args.reads``, validates them,
    logs coverage (when a genome size is given) and the reads N50/N90, and
    fills in the parameters dictionary.

    Args:
        args: parsed options. This function reads ``reads``, ``genome_size``,
            ``read_type``, ``min_overlap``, ``meta`` and ``asm_coverage``.

    Returns:
        dict with the keys ``"pipeline_version"``, ``"min_overlap"`` and
        ``"min_read_length"``.

    Raises:
        ConfigException: if a single read is longer than ``2**31 - 1`` or if
            no read is longer than the minimum length threshold.
    """
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    total_length = 0
    read_lengths = []
    MAX_READ_LEN = 2**31 - 1
    # A user-supplied minimum overlap takes precedence over the configured
    # lower bound when deciding whether any read is long enough.
    lowest_read_len = cfg.vals["min_overlap_range"][args.read_type][0]
    if args.min_overlap:
        lowest_read_len = args.min_overlap

    passing_reads = 0
    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            if seq_len > MAX_READ_LEN:
                raise ConfigException(
                    "Length of single read in '{}' exceeded maximum ({})".
                    format(read_file, MAX_READ_LEN))
            if seq_len > lowest_read_len:
                passing_reads += 1
            total_length += seq_len
            read_lengths.append(seq_len)

    if not passing_reads:
        raise ConfigException(
            "No reads above minimum length threshold ({})".format(
                lowest_read_len))

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)

    # Coverage can only be estimated when a genome size was supplied; keep
    # an explicit None so the downsampling step below can tell the cases
    # apart instead of hitting an unbound local.
    coverage = None
    if args.genome_size:
        coverage = total_length // args.genome_size
        logger.info("Input genome size: %d", args.genome_size)
        logger.info("Estimated coverage: %d", coverage)
        if coverage < 5 or coverage > 1000:
            logger.warning("Expected read coverage is " + str(coverage) +
                           ", the assembly is not " +
                           "guaranteed to be optimal in this setting." +
                           " Are you sure that the genome size " +
                           "was entered correctly?")

    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)
    if args.min_overlap is None:
        # Round the reads N90 to the nearest multiple of GRADE, then clamp
        # it to the configured range for this read type.
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE

        MIN_OVLP = cfg.vals["min_overlap_range"][args.read_type][0]
        MAX_OVLP = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            MAX_OVLP = min(MAX_OVLP, cfg.vals["max_meta_overlap"])

        parameters["min_overlap"] = max(MIN_OVLP, min(MAX_OVLP, int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Selecting k-mer size
    #parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type]
    #logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage:
        if coverage is None:
            # Without a genome size there is no coverage estimate and no
            # way to compute the downsampling target, so skip it.
            logger.warning("Genome size is not set, reads will not be "
                           "downsampled to the requested assembly coverage")
        elif args.asm_coverage < coverage:
            target_cov = args.asm_coverage

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
def setup_params(args):
    """Build the parameters dictionary for a run.

    Gathers the length of every read in ``args.reads``, logs the estimated
    coverage and the reads N50/N90, then chooses the minimum overlap, the
    k-mer size and the minimum read length.

    Args:
        args: parsed options. This function reads ``reads``, ``genome_size``,
            ``read_type``, ``min_overlap``, ``meta`` and ``asm_coverage``.

    Returns:
        dict with the keys ``"pipeline_version"``, ``"min_overlap"``,
        ``"kmer_size"`` and ``"min_read_length"``.
    """
    logger.info("Configuring run")
    parameters = {"pipeline_version": cfg.vals["pipeline_version"]}

    # Gather all read lengths across the input files.
    all_lengths = []
    for reads_path in args.reads:
        all_lengths.extend(
            length
            for _, length in iteritems(fp.read_sequence_lengths(reads_path)))
    bases_total = sum(all_lengths)

    _, n50 = _calc_nx(all_lengths, bases_total, 0.50)
    _, n90 = _calc_nx(all_lengths, bases_total, 0.90)

    # Coverage estimate and sanity warning
    logger.info("Total read length: %d", bases_total)
    coverage = bases_total // args.genome_size
    logger.info("Input genome size: %d", args.genome_size)
    logger.info("Estimated coverage: %d", coverage)
    if not 5 <= coverage <= 1000:
        logger.warning("Expected read coverage is " + str(coverage) +
                       ", the assembly is not " +
                       "guaranteed to be optimal in this setting." +
                       " Are you sure that the genome size " +
                       "was entered correctly?")

    # Minimum overlap: user-provided value, or the reads N90 rounded to the
    # nearest multiple of 1000 and clamped to the configured range.
    logger.info("Reads N50/N90: %d / %d", n50, n90)
    if args.min_overlap is not None:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])
    else:
        step = 1000
        rounded_n90 = int(round(n90 / step)) * step

        lower_bound = cfg.vals["min_overlap_range"][args.read_type][0]
        upper_bound = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            upper_bound = min(upper_bound, cfg.vals["max_meta_overlap"])

        parameters["min_overlap"] = max(lower_bound,
                                        min(upper_bound, rounded_n90))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])

    # K-mer size: first configured value below the "big genome" threshold,
    # second one otherwise.
    kmer_idx = 0 if args.genome_size < cfg.vals["big_genome_kmer"] else 1
    parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][kmer_idx]
    logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    # Downsampling reads for the first assembly stage to save memory
    wants_downsampling = (args.asm_coverage
                          and args.asm_coverage < coverage)
    target_cov = args.asm_coverage if wants_downsampling else None
    #if not args.asm_coverage and args.genome_size >= 10 ** 9:
    #    target_cov = cfg.vals["reduced_asm_cov"]

    if not target_cov:
        parameters["min_read_length"] = 0
        return parameters

    logger.info("Using longest %dx reads for contig assembly", target_cov)
    length_cutoff = _get_downsample_threshold(all_lengths,
                                              args.genome_size * target_cov)
    logger.debug("Min read length cutoff: %d", length_cutoff)
    parameters["min_read_length"] = length_cutoff
    return parameters