Example #1
def _compose_sequence(consensus_file):
    """
    Concatenates bubble consensuses into the genome
    """
    consensuses = defaultdict(list)
    coverage = defaultdict(list)
    with open(consensus_file, "r") as f:
        header = True
        for line in f:
            if header:
                tokens = line.strip().split(" ")
                if len(tokens) != 4:
                    raise Exception("Bubble format error")

                ctg_id = tokens[0][1:]
                ctg_pos = int(tokens[1])
                #coverage[ctg_id].append(int(tokens[2]))
                ctg_sub_pos = int(tokens[3])
            else:
                consensuses[ctg_id].append(
                    (ctg_pos, ctg_sub_pos, line.strip()))
            header = not header

    polished_fasta = {}
    polished_stats = {}
    for ctg_id, seqs in iteritems(consensuses):
        sorted_seqs = [p[2] for p in sorted(seqs, key=lambda p: (p[0], p[1]))]
        concat_seq = "".join(sorted_seqs)
        #mean_coverage = sum(coverage[ctg_id]) / len(coverage[ctg_id])
        polished_fasta[ctg_id] = concat_seq
        polished_stats[ctg_id] = len(concat_seq)

    return polished_fasta, polished_stats
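
A condensed, standalone sketch of the same concatenation logic, for illustration (in-memory lines instead of a file, plain dict.items instead of the module's iteritems; data invented):

from collections import defaultdict

def compose(lines):
    #condensed stand-in for _compose_sequence: alternating header/sequence
    #lines, header format ">ctg_id pos coverage sub_pos"
    consensuses = defaultdict(list)
    header = True
    for line in lines:
        if header:
            tokens = line.strip().split(" ")
            ctg_id, pos, sub_pos = tokens[0][1:], int(tokens[1]), int(tokens[3])
        else:
            consensuses[ctg_id].append((pos, sub_pos, line.strip()))
        header = not header
    return {c: "".join(p[2] for p in sorted(v)) for c, v in consensuses.items()}

print(compose([">ctg_1 100 12 0", "ACGT", ">ctg_1 0 0 0", "TTGA"]))
#{'ctg_1': 'TTGAACGT'}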
Example #2
def extract_unmapped_reads(args, reads2contigs_mapping, unmapped_reads_path,
                           mapping_rate_threshold):
    mapping_rates = calc_mapping_rates(reads2contigs_mapping)
    total_bases = 0
    unmapped_bases = 0

    with open(unmapped_reads_path, "w") as fout:
        for reads_file in args.reads:
            for hdr, sequence in fp.stream_sequence(reads_file):
                total_bases += len(sequence)

                is_unmapped = True
                contigs = mapping_rates.get(hdr)
                if contigs is not None:
                    for _, mapping_rate in iteritems(contigs):
                        if mapping_rate >= mapping_rate_threshold:
                            is_unmapped = False

                if is_unmapped:
                    unmapped_bases += len(sequence)
                    fout.write(">{0}\n{1}\n".format(hdr, sequence))

    logger.debug("Unmapped sequence: %d / %d (%f)", unmapped_bases,
                 total_bases, unmapped_bases / total_bases)
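
The filtering rule itself reduces to: keep a read as unmapped unless at least one contig hit reaches the threshold. A standalone sketch with invented values (read_3 has no hits at all):

mapping_rates = {"read_1": {"contig_1": 0.95}, "read_2": {"contig_3": 0.40}}
threshold = 0.5
unmapped = [r for r in ["read_1", "read_2", "read_3"]
            if all(rate < threshold
                   for rate in mapping_rates.get(r, {}).values())]
print(unmapped)  #['read_2', 'read_3'] -- read_3 had no hits at all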
Example #3
def get_contigs_info(contigs_file):
    contigs_info = {}
    contigs_fasta = fp.read_sequence_dict(contigs_file)
    for ctg_id, ctg_seq in iteritems(contigs_fasta):
        contig_type = ctg_id.split("_")[0]
        contigs_info[ctg_id] = ContigInfo(ctg_id, len(ctg_seq), contig_type)

    return contigs_info
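
The contig type is simply the prefix of the sequence id (example ids invented):

print("contig_7".split("_")[0])     #contig
print("scaffold_12".split("_")[0])  #scaffold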
Example #4
def trim_circular_reads(circular_reads, unmapped_reads):
    trimmed_circular_reads = dict()

    for i, (read, hit) in enumerate(iteritems(circular_reads)):
        sequence = unmapped_reads[read][:hit.target_start].upper()
        trimmed_circular_reads["circular_read_" + str(i)] = sequence

    return trimmed_circular_reads
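
A minimal sketch of the trimming step, assuming hit.target_start marks where the read begins to overlap itself (names and values invented):

from collections import namedtuple

Hit = namedtuple("Hit", "target_start")
unmapped_reads = {"read_a": "acgtacgtACGT"}
circular_reads = {"read_a": Hit(target_start=8)}

trimmed = {"circular_read_0": unmapped_reads["read_a"][:8].upper()}
print(trimmed)  #{'circular_read_0': 'ACGTACGT'}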
Example #5
def split_into_chunks(fasta_in, chunk_size):
    out_dict = {}
    for header, seq in iteritems(fasta_in):
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            out_dict[chunk_hdr] = seq[start : end]

    return out_dict
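
The boundary rule above means the final chunk absorbs any remainder shorter than chunk_size, so no chunk is ever undersized. A standalone check of the bounds:

def chunk_bounds(seq_len, chunk_size):
    bounds = []
    for i in range(max(seq_len // chunk_size, 1)):
        start, end = i * chunk_size, (i + 1) * chunk_size
        if seq_len - end < chunk_size:
            end = seq_len
        bounds.append((start, end))
    return bounds

print(chunk_bounds(10, 4))  #[(0, 4), (4, 10)] -- remainder merged into last chunk
print(chunk_bounds(3, 4))   #[(0, 3)] -- short sequences yield a single chunk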
Example #6
    def __init__(self, sam_alignment, reference_fasta, multiproc_manager,
                 max_coverage=None, use_secondary=False):
        #check that alignment exists
        if not os.path.exists(sam_alignment):
            raise AlignmentException("Can't open {0}".format(sam_alignment))
        if not os.path.exists(sam_alignment + ".bai"):
            raise AlignmentException("Bam not indexed: {0}".format(sam_alignment))

        #will not be changed during execution, each process has its own copy
        self.aln_path = sam_alignment
        self.max_coverage = max_coverage
        self.use_secondary = use_secondary
        self.cigar_parser = re.compile(b"[0-9]+[MIDNSHP=X]")

        self.ref_fasta = multiproc_manager.dict()
        for (h, s) in iteritems(reference_fasta):
            self.ref_fasta[_BYTES(h)] = _BYTES(s)
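
_BYTES is not shown here; judging from its usage it appears to be a Python 2/3 compatibility shim that normalizes strings to bytes before they go into the manager-backed dict shared across worker processes. A plausible stand-in, not the module's actual helper:

def _bytes_stub(s, encoding="utf-8"):
    #assumption: normalize str to bytes, pass bytes through unchanged
    return s if isinstance(s, bytes) else s.encode(encoding)

assert _bytes_stub("ctg_1") == b"ctg_1"
assert _bytes_stub(b"ctg_1") == b"ctg_1"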
Example #7
    def __init__(self,
                 sam_alignment,
                 reference_fasta,
                 max_coverage=None,
                 use_secondary=False):
        #check that alignment exists
        if not os.path.exists(sam_alignment):
            raise AlignmentException("Can't open {0}".format(sam_alignment))

        #will not be changed during execution, each process has its own copy
        self.aln_path = sam_alignment
        self.ref_fasta = {
            _BYTES(h): _BYTES(s)
            for (h, s) in iteritems(reference_fasta)
        }
        self.change_strand = True
        self.max_coverage = max_coverage
        self.use_secondary = use_secondary
        self.cigar_parser = re.compile(b"[0-9]+[MIDNSHP=X]")

        #will be shared between processes
        self.shared_manager = multiprocessing.Manager()
        self.shared_reader_queue = self.shared_manager.Queue()
        self.shared_num_jobs = multiprocessing.Value(ctypes.c_int, 0)
        self.shared_lock = self.shared_manager.Lock()
        self.shared_eof = multiprocessing.Value(ctypes.c_bool, False)

        #specific to IO thread
        self.io_thread = None
        self.terminate_flag = False
        self.processed_contigs = set()
        self.chunk_buffer = []
        self.current_contig = None

        #start IO thread
        self.io_thread = \
                multiprocessing.Process(target=SynchronizedSamReader._io_thread_worker,
                                        args=(self,))
        self.io_thread.start()
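
The constructor launches a separate IO process that feeds a manager-backed queue consumed by the workers. A self-contained sketch of that producer pattern (simplified names, sentinel-based EOF instead of the shared flags above):

import multiprocessing

def _producer(queue):
    for chunk in ["chunk_1", "chunk_2"]:
        queue.put(chunk)
    queue.put(None)  #sentinel signalling EOF

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    io_proc = multiprocessing.Process(target=_producer, args=(queue,))
    io_proc.start()
    while True:
        item = queue.get()
        if item is None:
            break
        print(item)
    io_proc.join()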
Example #8
    def __init__(self, sam_alignment, reference_fasta,
                 max_coverage=None, use_secondary=False):
        #will not be changed during execution, each process has its own copy
        self.aln_path = sam_alignment
        self.aln_file = None
        self.ref_fasta = {_BYTES(h) : _BYTES(s)
                          for (h, s) in iteritems(reference_fasta)}
        self.change_strand = True
        self.max_coverage = max_coverage
        self.seq_lengths = {}
        self.use_secondary = use_secondary
        self.cigar_parser = None
        self.processed_contigs = None

        #reading SAM header
        if not os.path.exists(self.aln_path):
            raise AlignmentException("Can't open {0}".format(self.aln_path))

        with open(self.aln_path, "rb") as f:
            for line in f:
                if not line or not _is_sam_header(line):
                    break
                if line.startswith(b"@SQ"):
                    seq_name = None
                    seq_len = None
                    for tag in line.split():
                        if tag.startswith(b"SN"):
                            seq_name = tag[3:]
                        if tag.startswith(b"LN"):
                            seq_len = int(tag[3:])
                    if seq_name and seq_len:
                        self.seq_lengths[_STR(seq_name)] = seq_len

        #will be shared between processes
        self.lock = multiprocessing.Lock()
        self.eof = multiprocessing.Value(ctypes.c_bool, False)
        self.position = multiprocessing.Value(ctypes.c_longlong, 0)
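
The @SQ parsing can be exercised on a single header line; note the reader works on bytes, and int() accepts bytes in Python 3:

line = b"@SQ\tSN:contig_1\tLN:45210"  #invented header line
seq_name, seq_len = None, None
for tag in line.split():
    if tag.startswith(b"SN"):
        seq_name = tag[3:]
    if tag.startswith(b"LN"):
        seq_len = int(tag[3:])
print(seq_name, seq_len)  #b'contig_1' 45210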
Example #9
def dump_repeats(repeats_info, filename):
    with open(filename, "w") as f:
        for repeat_id, info in iteritems(repeats_info):
            f.write("#Repeat {0}\t{1}\n\n".format(repeat_id,
                                                  info.multiplicity))

            f.write("#All reads\t{0}\n".format(len(info.all_reads)))
            for read in info.all_reads:
                f.write(read + "\n")
            f.write("\n")

            for in_edge in info.in_reads:
                f.write("#Input {0}\t{1}\n".format(
                    in_edge, len(info.in_reads[in_edge])))
                for read in info.in_reads[in_edge]:
                    f.write(read + "\n")
                f.write("\n")

            for out_edge in info.out_reads:
                f.write("#Output {0}\t{1}\n".format(
                    out_edge, len(info.out_reads[out_edge])))
                for read in info.out_reads[out_edge]:
                    f.write(read + "\n")
                f.write("\n")
Example #10
def generate_stats(repeat_file, polished_file, scaffolds, out_stats):
    """
    Compiles information from multiple stages
    """
    #contigs_length = {}
    #contigs_coverage = {}
    contigs_stats = {}
    header_line = "#seq_name\tlength\tcov.\tcirc.\trepeat\tmult.\tgraph_path"
    with open(repeat_file, "r") as f:
        for line in f:
            if line.startswith("#"): continue
            tokens = line.strip().split("\t")
            contigs_stats[tokens[0]] = SeqStats(*tokens)
            #if polished_file is None:
            #contigs_length[tokens[0]] = int(tokens[1])
            #contigs_coverage[tokens[0]] = int(tokens[2])

    if polished_file is not None:
        with open(polished_file, "r") as f:
            for line in f:
                if line.startswith("#"): continue
                tokens = line.strip().split("\t")
                contigs_stats[tokens[0]].length = tokens[1]
                contigs_stats[tokens[0]].coverage = tokens[2]

    scaffolds_stats = {}
    for scf, scf_seq in iteritems(scaffolds):
        scaffolds_stats[scf] = SeqStats(scf)
        scf_length = sum(
            [int(contigs_stats[unsigned(c)].length) for c in scf_seq])
        scf_length += (len(scf_seq) - 1) * cfg.vals["scaffold_gap"]
        scaffolds_stats[scf].length = str(scf_length)

        scf_cov = _mean(
            [int(contigs_stats[unsigned(c)].coverage) for c in scf_seq])
        scaffolds_stats[scf].coverage = str(scf_cov)

        first_ctg = contigs_stats[unsigned(scf_seq[0])]
        scaffolds_stats[scf].repeat = first_ctg.repeat
        scaffolds_stats[scf].circular = first_ctg.circular

        scf_mult = min([int(contigs_stats[unsigned(c)].mult) for c in scf_seq])
        scaffolds_stats[scf].mult = str(scf_mult)

        #telomere information
        telomere_left = contigs_stats[unsigned(scf_seq[0])].telomere
        telomere_right = contigs_stats[unsigned(scf_seq[-1])].telomere
        if scf_seq[0][0] == "+":
            scf_left = telomere_left in ["left", "both"]
        else:
            scf_left = telomere_left in ["right", "both"]
        if scf_seq[-1][0] == "+":
            scf_right = telomere_right in ["right", "both"]
        else:
            scf_right = telomere_right in ["left", "both"]
        #if scf_left and scf_right: scaffolds_stats[scf].telomere = "both"
        #elif scf_left and not scf_right: scaffolds_stats[scf].telomere = "left"
        #elif not scf_left and scf_right: scaffolds_stats[scf].telomere = "right"
        #else: scaffolds_stats[scf].telomere = "none"

        #graph path
        path = []
        for ctg in scf_seq:
            ctg_path = contigs_stats[unsigned(ctg)].graph_path
            if ctg[0] == "-":
                ctg_path = ",".join(
                    [str(-int(x)) for x in ctg_path.split(",")][::-1])
            path.append(ctg_path)
        prefix = "*," if scf_left else ""
        suffix = ",*" if scf_right else ""
        scaffolds_stats[scf].graph_path = prefix + ",??,".join(path) + suffix

    with open(out_stats, "w") as f:
        f.write(header_line + "\n")
        for scf in sorted(scaffolds_stats,
                          key=lambda x: int(x.rsplit("_", 1)[-1])):
            scaffolds_stats[scf].print_out(f)

    total_length = sum([int(x.length) for x in scaffolds_stats.values()])
    if total_length == 0: return

    num_scaffolds = len(scaffolds_stats)
    num_contigs = sum([len(x) for x in scaffolds.values()])

    scaffold_lengths = [int(s.length) for s in scaffolds_stats.values()]
    contig_lengths = []
    for scf in scaffolds.values():
        for ctg in scf:
            contig_lengths.append(int(contigs_stats[unsigned(ctg)].length))
    largest_scf = max(scaffold_lengths)

    #ctg_n50 = _calc_n50(contig_lengths, total_length)
    scf_n50 = _calc_n50(scaffold_lengths, total_length)

    mean_read_cov = 0
    for scf in scaffolds_stats.values():
        mean_read_cov += int(scf.length) * int(scf.coverage)
    mean_read_cov //= total_length

    logger.info(
        "Assembly statistics:\n\n"
        "\tTotal length:\t%d\n"
        "\tFragments:\t%d\n"
        #"\tContigs N50:\t{2}\n"
        "\tFragments N50:\t%d\n"
        "\tLargest frg:\t%d\n"
        "\tScaffolds:\t%d\n"
        "\tMean coverage:\t%d\n",
        total_length,
        num_scaffolds,
        scf_n50,
        largest_scf,
        num_contigs - num_scaffolds,
        mean_read_cov)
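
_calc_n50 is defined elsewhere in the module; the standard definition it implements is the largest length L such that fragments of length >= L cover at least half of the total assembly. A hedged sketch of that semantics:

def calc_n50(lengths, total_length):
    #assumed semantics of _calc_n50; not the module's exact implementation
    cum = 0
    for ln in sorted(lengths, reverse=True):
        cum += ln
        if cum >= total_length / 2:
            return ln
    return 0

print(calc_n50([100, 80, 50, 30, 10], 270))  #80 (100 + 80 = 180 >= 135)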
Example #11
def setup_params(args):
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    total_length = 0
    read_lengths = []
    MAX_READ_LEN = 2**31 - 1

    lowest_read_len = cfg.vals["min_overlap_range"][args.read_type][0]
    if args.min_overlap:
        lowest_read_len = args.min_overlap
    passing_reads = 0

    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            if seq_len > MAX_READ_LEN:
                raise ConfigException(
                    "Length of single read in '{}' exceeded maximum ({})".
                    format(read_file, MAX_READ_LEN))
            if seq_len > lowest_read_len:
                passing_reads += 1

            total_length += seq_len
            read_lengths.append(seq_len)

    if not passing_reads:
        raise ConfigException(
            "No reads above minimum length threshold ({})".format(
                lowest_read_len))

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)

    #args.genome_size may be unset; coverage stays None in that case
    coverage = None
    if args.genome_size:
        coverage = total_length // args.genome_size
        logger.info("Input genome size: %d", args.genome_size)
        logger.info("Estimated coverage: %d", coverage)
        if coverage < 5 or coverage > 1000:
            logger.warning("Expected read coverage is %d, the assembly is not "
                           "guaranteed to be optimal in this setting. Are you "
                           "sure that the genome size was entered correctly?",
                           coverage)

    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)
    if args.min_overlap is None:
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE

        MIN_OVLP = cfg.vals["min_overlap_range"][args.read_type][0]
        MAX_OVLP = cfg.vals["min_overlap_range"][args.read_type][1]
        if args.meta:
            MAX_OVLP = min(MAX_OVLP, cfg.vals["max_meta_overlap"])
        parameters["min_overlap"] = max(MIN_OVLP, min(MAX_OVLP, int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Selecting k-mer size
    #parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type]
    #logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage and coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
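
The minimum-overlap heuristic rounds the reads N90 to the nearest 1000 and clamps it into the configured range. The rule in isolation (range values invented):

def pick_min_overlap(reads_n90, min_ovlp, max_ovlp, grade=1000):
    rounded = int(round(reads_n90 / grade)) * grade
    return max(min_ovlp, min(max_ovlp, rounded))

print(pick_min_overlap(8300, 1000, 10000))   #8000
print(pick_min_overlap(27500, 1000, 10000))  #10000 (clamped to MAX_OVLP)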
Example #12
def setup_params(args):
    logger.info("Configuring run")
    parameters = {}
    parameters["pipeline_version"] = cfg.vals["pipeline_version"]

    total_length = 0
    read_lengths = []
    for read_file in args.reads:
        for _, seq_len in iteritems(fp.read_sequence_lengths(read_file)):
            total_length += seq_len
            read_lengths.append(seq_len)

    _, reads_n50 = _calc_nx(read_lengths, total_length, 0.50)
    _, reads_n90 = _calc_nx(read_lengths, total_length, 0.90)

    #Selecting minimum overlap
    logger.info("Total read length: %d", total_length)

    coverage = total_length // args.genome_size
    logger.info("Input genome size: %d", args.genome_size)
    logger.info("Estimated coverage: %d", coverage)
    if coverage < 5 or coverage > 1000:
        logger.warning("Expected read coverage is " + str(coverage) +
                       ", the assembly is not " +
                       "guaranteed to be optimal in this setting." +
                       " Are you sure that the genome size " +
                       "was entered correctly?")

    logger.info("Reads N50/N90: %d / %d", reads_n50, reads_n90)
    if args.min_overlap is None:
        GRADE = 1000
        int_min_ovlp = int(round(reads_n90 / GRADE)) * GRADE

        parameters["min_overlap"] = \
            max(cfg.vals["min_overlap_range"][args.read_type][0],
                min(cfg.vals["min_overlap_range"][args.read_type][1], int_min_ovlp))
        logger.info("Minimum overlap set to %d", parameters["min_overlap"])
    else:
        parameters["min_overlap"] = args.min_overlap
        logger.info("Selected minimum overlap: %d", parameters["min_overlap"])

    #Selecting k-mer size
    if args.genome_size < cfg.vals["big_genome_kmer"]:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][0]
    else:
        parameters["kmer_size"] = cfg.vals["kmer_size"][args.read_type][1]
    logger.info("Selected k-mer size: %d", parameters["kmer_size"])

    #Downsampling reads for the first assembly stage to save memory
    target_cov = None
    if args.asm_coverage and args.asm_coverage < coverage:
        target_cov = args.asm_coverage
    #if not args.asm_coverage and args.genome_size >= 10 ** 9:
    #    target_cov = cfg.vals["reduced_asm_cov"]

    if target_cov:
        logger.info("Using longest %dx reads for contig assembly", target_cov)
        min_read = _get_downsample_threshold(read_lengths,
                                             args.genome_size * target_cov)
        logger.debug("Min read length cutoff: %d", min_read)
        parameters["min_read_length"] = min_read
    else:
        parameters["min_read_length"] = 0

    return parameters
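
This variant also picks the k-mer size from a small/large-genome pair in the configuration. A sketch of the rule in isolation (threshold and sizes invented; the real values live in cfg.vals):

cfg_vals = {"big_genome_kmer": 10 ** 9,
            "kmer_size": {"raw": (15, 17)}}  #invented stand-in config

def pick_kmer(genome_size, read_type="raw"):
    small_k, big_k = cfg_vals["kmer_size"][read_type]
    return small_k if genome_size < cfg_vals["big_genome_kmer"] else big_k

print(pick_kmer(5 * 10 ** 6))  #15
print(pick_kmer(3 * 10 ** 9))  #17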