Example #1
0
def short_statistics(fasta_file):
    """
    Returns a pair (total sequence length, value computed by _calc_n50)
    for the sequences in the given file. If the file yields no sequence
    lengths, (0, 0) is returned.
    """
    seq_lengths = fp.read_sequence_lengths(fasta_file)
    lengths = list(seq_lengths.values())
    if lengths:
        total_size = sum(lengths)
        n50 = _calc_n50(lengths, total_size)
        return total_size, n50

    #Nothing to summarize
    return 0, 0
Example #2
0
def setup_params(args):
    """
    Builds the dictionary of run parameters from the parsed arguments.

    Sequence lengths of all files in args.reads are collected to get the
    total read length and the reads N50/N90 (via _calc_nx). Based on these
    and on args.genome_size, args.read_type, args.min_overlap and
    args.asm_coverage the following keys are set:

      "min_overlap"     - args.min_overlap if it is not None; otherwise the
                          reads N90 rounded to a multiple of 1000 and clamped
                          to the configured "min_overlap_range" of the
                          read type
      "kmer_size"       - first or second configured k-mer size of the read
                          type, depending on whether the genome size is
                          below "big_genome_kmer"
      "min_read_length" - cutoff given by _get_downsample_threshold if
                          args.asm_coverage is set and lower than the
                          estimated coverage, 0 otherwise

    args.genome_size is used as a divisor and must be non-zero.
    Returns the parameters dictionary.
    """
    logger.info("Configuring run")
    parameters = {}

    #Only the lengths are needed; values() works on both Python 2 and 3,
    #while iteritems() is not available on Python 3 dictionaries
    read_lengths = []
    for read_file in args.reads:
        read_lengths.extend(fp.read_sequence_lengths(read_file).values())
    total_length = sum(read_lengths)

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.debug("Total sequence length: {0}".format(total_length))

    #Explicit floor division, so as the coverage stays an integer
    #estimate regardless of the interpreter version
    coverage = total_length // args.genome_size
    logger.info("Input genome size: {0}".format(args.genome_size))
    logger.info("Estimated coverage: {0}".format(coverage))
    if coverage < 5 or coverage > 1000:
        logger.warning("Expected read coverage is " + str(coverage) +
                       ", the assembly is not " +
                       "guaranteed to be optimal in this setting." +
                       " Are you sure that the genome size " +
                       "was entered correctly?")

    logger.info("Reads N50/N90: {0} / {1}".format(reads_n50, reads_n90))
    if args.min_overlap is None:
        #Rounding N90 to the nearest multiple of GRADE
        GRADE = 1000
        int_min_ovlp = int(round(float(reads_n90) / GRADE)) * GRADE

        #Keeping the value within the range configured for the read type
        parameters["min_overlap"] = \
            max(cfg.vals["min_overlap_range"][args.read_type][0],
                min(cfg.vals["min_overlap_range"][args.read_type][1], int_min_ovlp))
        logger.info("Minimum overlap set to {0}".format(
            parameters["min_overlap"]))
    else:
        parameters["min_overlap"] = args.min_overlap

    #Selecting k-mer size
    if args.genome_size < cfg.vals["big_genome_kmer"]:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][0]
    else:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][1]
    logger.info("Selected k-mer size: {0}".format(parameters["kmer_size"]))

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage
    #if not args.asm_coverage and args.genome_size >= 10 ** 9:
    #    target_cov = cfg.vals["reduced_asm_cov"]

    if target_cov:
        logger.info(
            "Using longest {}x reads for contig assembly".format(target_cov))
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: {0}".format(min_read))
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
Example #3
0
def setup_params(args):
    """
    Builds the dictionary of run parameters from the parsed arguments.

    Sequence lengths of all files in args.reads are collected to get the
    total read length and the reads N50/N90 (via _calc_nx). The following
    keys are set:

      "pipeline_version" - copied from the configuration
      "min_overlap"      - args.min_overlap if it is not None; otherwise the
                           reads N90 rounded to a multiple of 1000 and
                           clamped to the configured "min_overlap_range" of
                           the read type (with the upper bound additionally
                           limited by "max_meta_overlap" if args.meta is set)
      "min_read_length"  - cutoff given by _get_downsample_threshold if
                           args.asm_coverage is set and lower than the
                           estimated coverage, 0 otherwise

    args.genome_size is optional; coverage is only estimated if it is set.

    Raises ConfigException if a read is longer than 2**31 - 1, if no read
    is longer than the minimum length threshold, or if args.asm_coverage
    is set without args.genome_size.
    Returns the parameters dictionary.
    """
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    total_length = 0
    read_lengths = []
    MAX_READ_LEN = 2**31 - 1

    #Reads should be longer than the minimum overlap to be of any use
    lowest_read_len = cfg.vals["min_overlap_range"][args.read_type][0]
    if args.min_overlap:
        lowest_read_len = args.min_overlap
    passing_reads = 0

    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            if seq_len > MAX_READ_LEN:
                raise ConfigException(
                    "Length of single read in '{}' exceeded maximum ({})".
                    format(read_file, MAX_READ_LEN))
            if seq_len > lowest_read_len:
                passing_reads += 1

            total_length += seq_len
            read_lengths.append(seq_len)

    if not passing_reads:
        raise ConfigException(
            "No reads above minimum length threshold ({})".format(
                lowest_read_len))

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)

    #Coverage stays None if the genome size was not provided
    coverage = None
    if args.genome_size:
        coverage = total_length // args.genome_size
        logger.info("Input genome size: %d", args.genome_size)
        logger.info("Estimated coverage: %d", coverage)
        if coverage < 5 or coverage > 1000:
            logger.warning("Expected read coverage is " + str(coverage) +
                           ", the assembly is not " +
                           "guaranteed to be optimal in this setting." +
                           " Are you sure that the genome size " +
                           "was entered correctly?")

    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)
    if args.min_overlap is None:
        #Rounding N90 to the nearest multiple of GRADE
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE

        MIN_OVLP = cfg.vals["min_overlap_range"][args.read_type][0]
        MAX_OVLP = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            MAX_OVLP = min(MAX_OVLP, cfg.vals["max_meta_overlap"])
        parameters["min_overlap"] = max(MIN_OVLP, min(MAX_OVLP, int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Selecting k-mer size
    #parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type]
    #logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage:
        #Both the coverage estimate and the downsampling threshold depend on
        #the genome size, so fail with a clear message instead of a NameError
        if coverage is None:
            raise ConfigException(
                "Downsampling reads (asm_coverage) requires "
                "a genome size estimate")
        if args.asm_coverage < coverage:
            target_cov = args.asm_coverage

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
Example #4
0
def setup_params(args):
    """
    Computes the parameters of the run from the arguments and input reads.

    The returned dictionary contains "pipeline_version" (taken from the
    configuration), "min_overlap" (args.min_overlap if it is not None,
    otherwise derived from the reads N90), "kmer_size" (one of the two
    configured sizes of the read type, chosen by genome size) and
    "min_read_length" (a downsampling cutoff, or 0 if no downsampling
    is applied). args.genome_size is used as a divisor.
    """
    logger.info("Configuring run")
    parameters = {"pipeline_version": cfg.vals["pipeline_version"]}

    #Lengths of all reads from all input files
    read_lengths = [
        seq_len
        for read_file in args.reads
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file))
    ]
    total_length = sum(read_lengths)

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    logger.info("Total read length: %d", total_length)

    #Coverage estimate and sanity check
    coverage = total_length // args.genome_size
    logger.info("Input genome size: %d", args.genome_size)
    logger.info("Estimated coverage: %d", coverage)
    if coverage > 1000 or coverage < 5:
        coverage_msg = ("Expected read coverage is " + str(coverage) +
                        ", the assembly is not " +
                        "guaranteed to be optimal in this setting." +
                        " Are you sure that the genome size " +
                        "was entered correctly?")
        logger.warning(coverage_msg)

    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)

    #Minimum overlap: either given explicitly, or derived from reads N90
    if args.min_overlap is not None:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])
    else:
        grade = 1000
        rounded_n90 = int(round(reads_n90 / grade)) * grade

        lower_bound = cfg.vals["min_overlap_range"][args.read_type][0]
        upper_bound = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            upper_bound = min(upper_bound, cfg.vals["max_meta_overlap"])

        #Clamping into [lower_bound, upper_bound]
        capped_ovlp = min(upper_bound, rounded_n90)
        parameters["min_overlap"] = max(lower_bound, capped_ovlp)
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])

    #K-mer size: the second configured value is used for big genomes
    kmer_index = 0 if args.genome_size < cfg.vals["big_genome_kmer"] else 1
    parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][kmer_index]
    logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Reads are downsampled for the first assembly stage (to save memory)
    #only if the requested coverage is below the estimated one
    if args.asm_coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage
    else:
        target_cov = None

    if not target_cov:
        parameters["min_read_length"] = 0
    else:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        length_cutoff = _get_downsample_threshold(
            read_lengths, args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", length_cutoff)
        parameters["min_read_length"] = length_cutoff

    return parameters