Esempio n. 1
0
def downsample_kmers(tmp_dirpath, ref_fpath, kmc_db_fpath, kmer_len, log_fpath, err_fpath):
    """Select a sparse subset of the reference k-mers kept by ``filter_contigs``.

    For every sequence of ``ref_fpath`` all substrings of length ``kmer_len``
    are written to ``kmers_<chrom>.fasta`` in ``tmp_dirpath`` (each record is
    named by its 0-based start offset), the file is passed through
    ``filter_contigs`` together with ``kmc_db_fpath``, and the surviving
    k-mers are thinned so that consecutive selected k-mers of one sequence
    start at least ``KMERS_INTERVAL`` positions apart.

    Returns:
        ``(ref_kmers, downsampled_txt_fpath)`` where ``ref_kmers`` maps a
        global k-mer index (offset within the sequence plus the number of
        k-mers in all preceding sequences) to ``(chrom, offset)`` and
        ``downsampled_txt_fpath`` is the FASTA-formatted file holding the
        selected k-mers under the same global indices.
    """
    downsampled_txt_fpath = join(tmp_dirpath, 'kmc.downsampled.txt')
    # Truncate (or create) the output once; per-sequence results are appended below.
    with open(downsampled_txt_fpath, 'w'):
        pass
    ref_kmers = dict()
    prev_kmer_idx = 0
    for chrom, seq in read_fasta(ref_fpath):
        kmc_fasta_fpath = join(tmp_dirpath, 'kmers_' + chrom + '.fasta')
        # Clamp at zero: a sequence shorter than kmer_len has no k-mers, and a
        # negative count would shift the global index backwards and make the
        # indices of later sequences collide with earlier ones.
        num_kmers_in_seq = max(0, len(seq) - kmer_len + 1)
        with open(kmc_fasta_fpath, 'w') as out_f:
            for i in range(num_kmers_in_seq):
                out_f.write('>' + str(i) + '\n')
                out_f.write(seq[i: i + kmer_len] + '\n')
        filtered_fpath = join(tmp_dirpath, 'kmers_' + chrom + '.filtered.fasta')
        filter_contigs(kmc_fasta_fpath, filtered_fpath, kmc_db_fpath, log_fpath, err_fpath, min_kmers=1)
        filtered_kmers = set(idx for idx, _ in read_fasta(filtered_fpath))
        with open(downsampled_txt_fpath, 'a') as out_f:
            # None (not 0) marks "nothing selected yet": offset 0 is a valid
            # selection and must still enforce the interval for the next k-mer.
            last_selected = None
            for idx, kmer_seq in read_fasta(kmc_fasta_fpath):
                if idx not in filtered_kmers:
                    continue
                kmer_i = int(idx)
                if last_selected is None or kmer_i - last_selected >= KMERS_INTERVAL:
                    last_selected = kmer_i
                    out_f.write('>' + str(prev_kmer_idx + kmer_i) + '\n')
                    out_f.write(kmer_seq + '\n')
                    ref_kmers[prev_kmer_idx + kmer_i] = (chrom, kmer_i)
        prev_kmer_idx += num_kmers_in_seq
        if qconfig.space_efficient:
            os.remove(kmc_fasta_fpath)
    return ref_kmers, downsampled_txt_fpath
Esempio n. 2
0
def fill_gaps_mate_pair(bam_fpath, ref_fpath, assembly_fpath, assembly_covered_regions, output_dir, uncovered_fpath, err_fpath):
    """Write an assembly whose neighbouring covered fragments are joined via mate-pairs.

    For every reference sequence the covered regions are obtained with
    ``find_overlaps`` from ``assembly_covered_regions`` and the regions parsed
    from ``uncovered_fpath``. Consecutive regions for which ``is_overlapped``
    reports a connection (using the intervals from ``connect_with_matepairs``)
    are merged with ``merge_fragments_with_ns``; all other regions are emitted
    as separate contigs named ``<first word of name>_<ordinal>``.

    Returns:
        Path of the written FASTA file (``assembly_fpath`` with
        ``mp_polished_suffix`` added).
    """
    matepair_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_fasta = []
    matepair_regions = connect_with_matepairs(bam_fpath, output_dir, err_fpath)
    final_assembly_fpath = add_suffix(assembly_fpath, mp_polished_suffix)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = list(find_overlaps(assembly_covered_regions[name], matepair_reads_covered_regions[name], overlap=50))
        total_contigs = 0
        # "<= 1" (not "== 1"): with no covered regions at all there is nothing
        # to merge, and taking the first fragment below would raise IndexError.
        if name not in matepair_regions or len(covered_regions) <= 1:
            for region in covered_regions:
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), seq[region[0]: region[1]]))
                total_contigs += 1
            continue

        sorted_mp_intervals = sorted(matepair_regions[name])
        frags_to_merge = [covered_regions[0]]
        for region in covered_regions[1:]:
            if is_overlapped(frags_to_merge[-1], region, sorted_mp_intervals):
                frags_to_merge.append(region)
                continue
            # The chain is broken: flush what was collected and start a new chain.
            merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
            final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
            total_contigs += 1
            frags_to_merge = [region]
        # The last chain always holds at least one fragment.
        merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
        final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
        total_contigs += 1
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
Esempio n. 3
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Split the scaffolds of ``contigs_fpath`` into contigs with ``split_by_ns``.

    Appends progress messages to ``logs``. The broken assembly is written
    next to the corrected assembly (``<corrected name>_broken<ext>``) only
    when the total number of contigs reported by ``split_by_ns`` exceeds the
    number of scaffolds read.

    Returns:
        ``(broken_scaffolds_fpath, logs)`` when a file was written, otherwise
        ``(None, logs)``.
    """
    def _log(message):
        # Every message carries the same indented assembly index prefix.
        logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + message)

    _log('  breaking scaffolds into contigs:')
    _, fasta_ext = splitext_for_fasta_file(os.path.basename(contigs_fpath))
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    broken_scaffolds_fpath = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath)) + '_broken' + fasta_ext

    broken_scaffolds_fasta = []
    contigs_counter = 0
    last_scaffold_idx = 0  # stays 0 when the input has no records
    for last_scaffold_idx, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        contigs_counter += split_by_ns(seq, name, broken_scaffolds_fasta,
                                       qconfig.Ns_break_threshold, qconfig.min_contig)

    if contigs_counter <= last_scaffold_idx + 1:
        _log("    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
        return None, logs

    fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
    _log("    %d scaffolds (%s) were broken into %d contigs (%s)" %
         (last_scaffold_idx + 1, label, contigs_counter, label + '_broken'))
    return broken_scaffolds_fpath, logs
Esempio n. 4
0
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    """Validate a FASTA file and optionally write a corrected copy.

    Records shorter than ``min_contig`` are dropped unless ``is_reference``
    is set. Names are passed through ``correct_name`` and made unique with
    ``get_uniq_name``; sequences go through ``correct_seq`` unless
    ``qconfig.no_check`` is set.

    Returns:
        ``False`` when a record has an empty name, when ``correct_seq``
        returns a falsy value, or when no record is kept; ``True`` otherwise
        (after writing ``corrected_fpath`` if it was given).
    """
    kept_entries = []
    name_usage = defaultdict(int)
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent='    ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        base_name = correct_name(header)
        unique_name = get_uniq_name(base_name, name_usage)
        name_usage[base_name] += 1

        if qconfig.no_check:
            checked_seq = sequence
        else:
            # Original note: the sequence is upper-cased here because later
            # stages only look at uppercase letters.
            checked_seq = correct_seq(sequence, original_fpath)
            if not checked_seq:
                return False
        kept_entries.append((unique_name, checked_seq))

    if not kept_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, kept_entries)
    return True
Esempio n. 5
0
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    """Validate a FASTA file and optionally write a corrected copy.

    Records shorter than ``min_contig`` are dropped unless ``is_reference``
    is set. Names are passed through ``correct_name`` and made unique with
    ``get_uniq_name``. Sequences go through ``correct_seq`` unless
    ``qconfig.no_check`` is set, in which case they are only scanned for
    characters outside ``ACGTN`` and an error is logged if one is found.

    Returns:
        ``False`` when a record has an empty name, when a sequence fails the
        check, or when no record is kept; ``True`` otherwise (after writing
        ``corrected_fpath`` if it was given).
    """
    kept_entries = []
    name_usage = defaultdict(int)
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent='    ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        base_name = correct_name(header)
        unique_name = get_uniq_name(base_name, name_usage)
        name_usage[base_name] += 1

        if qconfig.no_check:
            # Checks are disabled, but anything besides uppercase ACGTN is still rejected.
            if re.search(r'[^ACGTN]', sequence):
                logger.error('File ' + original_fpath + ' contains non-ACGTN characters. '
                             'Please re-run QUAST without --no-check.', indent='    ', exit_with_code=1)
                return False
            checked_seq = sequence
        else:
            # Original note: the sequence is upper-cased here because later
            # stages only look at uppercase letters.
            checked_seq = correct_seq(sequence, original_fpath)
            if not checked_seq:
                return False
        kept_entries.append((unique_name, checked_seq))

    if not kept_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, kept_entries)
    return True
Esempio n. 6
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Split the scaffolds of ``contigs_fpath`` into contigs with ``split_by_ns``.

    Appends progress messages to ``logs``. The broken assembly is written
    next to the corrected assembly (``<corrected name>_broken<ext>``) only
    when the total number of contigs reported by ``split_by_ns`` exceeds the
    number of scaffolds read.

    Returns:
        ``(broken_scaffolds_fpath, logs)`` when a file was written, otherwise
        ``(None, logs)``.
    """
    def _log(message):
        # Every message carries the same indented assembly index prefix.
        logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + message)

    _log('  breaking scaffolds into contigs:')
    _, fasta_ext = splitext_for_fasta_file(os.path.basename(contigs_fpath))
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    broken_scaffolds_fpath = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath)) + '_broken' + fasta_ext

    broken_scaffolds_fasta = []
    contigs_counter = 0
    last_scaffold_idx = 0  # stays 0 when the input has no records
    for last_scaffold_idx, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        contigs_counter += split_by_ns(seq, name, broken_scaffolds_fasta,
                                       qconfig.Ns_break_threshold, qconfig.min_contig)

    if contigs_counter <= last_scaffold_idx + 1:
        _log("    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
        return None, logs

    fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
    _log("    %d scaffolds (%s) were broken into %d contigs (%s)" %
         (last_scaffold_idx + 1, label, contigs_counter, label + '_broken'))
    return broken_scaffolds_fpath, logs
Esempio n. 7
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a corrected copy of a FASTA file; split oversized references.

    Records shorter than ``min_contig`` are dropped unless ``is_reference``
    is set. Names are passed through ``correct_name`` and made unique with
    ``get_uniq_name``; sequences go through ``correct_seq`` unless
    ``qconfig.no_check`` is set.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the chromosomes are additionally
    distributed over ``split_ref/part_<n><ext>`` files beside
    ``corrected_fpath`` and the part paths are stored in
    ``qconfig.splitted_ref``. Chromosomes longer than
    ``qconfig.MAX_REFERENCE_LENGTH`` are skipped with a warning.

    Returns:
        ``False`` when a record has an empty name, when ``correct_seq``
        returns a falsy value, or when every chromosome of an oversized
        reference was skipped; ``True`` otherwise.
    """
    kept_entries = []
    name_usage = defaultdict(int)
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.',
                    indent='    ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        base_name = correct_name(header)
        unique_name = get_uniq_name(base_name, name_usage)
        name_usage[base_name] += 1

        if qconfig.no_check:
            checked_seq = sequence
        else:
            # Original note: the sequence is upper-cased here because later
            # stages only look at uppercase letters.
            checked_seq = correct_seq(sequence, original_fpath)
            if not checked_seq:
                return False
        kept_entries.append((unique_name, checked_seq))

    fastaparser.write_fasta(corrected_fpath, kept_entries)

    if not is_reference:
        return True
    ref_len = sum(len(chr_seq) for (_, chr_seq) in kept_entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    # Reset first: important for MetaQUAST, which runs QUAST multiple times.
    qconfig.splitted_ref = []
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)
    max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)

    def _part_fpath(part_num):
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    part_len = 0
    part_num = 1
    part_fpath = _part_fpath(part_num)
    for (chr_name, chr_seq) in kept_entries:
        chr_len = len(chr_seq)
        if chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                    str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        part_len += chr_len
        # Start a new part once the current one would overflow, unless this
        # chromosome is the only content of the part so far.
        if part_len > max_len and part_len != chr_len:
            qconfig.splitted_ref.append(part_fpath)
            part_len = chr_len
            part_num += 1
            part_fpath = _part_fpath(part_num)
        fastaparser.write_fasta(part_fpath, [(chr_name, chr_seq)], mode='a')

    if part_len > 0:
        qconfig.splitted_ref.append(part_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Esempio n. 8
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Split the scaffolds of ``contigs_fpath`` at long runs of ``N``.

    A run of at least ``qconfig.Ns_break_threshold`` consecutive ``N``
    characters ends the current contig; pieces shorter than
    ``qconfig.min_contig`` are discarded. Pieces are named
    ``<first word of scaffold name>_<ordinal>``. Progress messages are
    appended to ``logs``. The broken assembly is written only if at least
    one such run was found.

    Returns:
        ``(broken_scaffolds_fpath, logs)`` when a file was written, otherwise
        ``(None, logs)``.
    """
    def _log(message):
        # Every message carries the same indented assembly index prefix.
        logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + message)

    def _emit(scaffold_name, ordinal, piece):
        broken_scaffolds_fasta.append((scaffold_name.split()[0] + "_" + str(ordinal), piece))

    _log('  breaking scaffolds into contigs:')
    _, fasta_ext = splitext_for_fasta_file(os.path.basename(contigs_fpath))
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    broken_scaffolds_fpath = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath)) + '_broken' + fasta_ext

    broken_scaffolds_fasta = []
    contigs_counter = 0
    is_broken = False
    last_scaffold_idx = 0  # stays 0 when the input has no records
    for last_scaffold_idx, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        seq_len = len(seq)
        pieces_in_scaffold = 0
        piece_start = 0
        search_from = 0
        while search_from < seq_len:
            gap_start = seq.find('N', search_from)
            if gap_start == -1:
                break
            # Extend the gap over the whole run of consecutive Ns.
            gap_end = gap_start + 1
            while gap_end != seq_len and seq[gap_end] == 'N':
                gap_end += 1
            # seq[gap_end] is known not to be 'N', so the next search may skip it.
            search_from = gap_end + 1

            if gap_end - gap_start < qconfig.Ns_break_threshold:
                continue
            is_broken = True
            if gap_start - piece_start >= qconfig.min_contig:
                _emit(name, pieces_in_scaffold + 1, seq[piece_start:gap_start])
                pieces_in_scaffold += 1
            piece_start = gap_end

        if seq_len - piece_start >= qconfig.min_contig:
            _emit(name, pieces_in_scaffold + 1, seq[piece_start:])
            pieces_in_scaffold += 1

        contigs_counter += pieces_in_scaffold

    if not is_broken:
        _log("    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
        return None, logs

    fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
    _log("    %d scaffolds (%s) were broken into %d contigs (%s)" %
         (last_scaffold_idx + 1, label, contigs_counter, label + '_broken'))
    return broken_scaffolds_fpath, logs
Esempio n. 9
0
def save_circos_GC(ref_fpath, reference_length, gc_fpath):
    """Write per-window GC values of the reference as a tab-separated file.

    The window size comes from ``set_window_size(reference_length)``. Each
    output line holds the sequence name, the window start, the window start
    plus the window size, and the value returned by ``get_GC_percent``.
    """
    step = set_window_size(reference_length)
    with open(gc_fpath, 'w') as gc_file:
        for chrom, chrom_seq in fastaparser.read_fasta(ref_fpath):
            for win_start in range(0, len(chrom_seq), step):
                win_end = win_start + step
                gc_value = get_GC_percent(chrom_seq[win_start:win_end], step)
                fields = (chrom, str(win_start), str(win_end), str(gc_value))
                gc_file.write('\t'.join(fields) + '\n')
Esempio n. 10
0
def save_circos_GC(ref_fpath, reference_length, gc_fpath):
    """Write per-window GC values of the reference as a tab-separated file.

    The window size comes from ``set_window_size(reference_length)``. Each
    output line holds the sequence name, the window start, the window start
    plus the window size, and the value returned by ``get_GC_percent``.
    """
    step = set_window_size(reference_length)
    with open(gc_fpath, 'w') as gc_file:
        for chrom, chrom_seq in fastaparser.read_fasta(ref_fpath):
            for win_start in range(0, len(chrom_seq), step):
                win_end = win_start + step
                gc_value = get_GC_percent(chrom_seq[win_start:win_end], step)
                fields = (chrom, str(win_start), str(win_end), str(gc_value))
                gc_file.write('\t'.join(fields) + '\n')
Esempio n. 11
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run the ``glimmerhmm`` executable on every contig and collect the genes.

    Each record of ``fasta_fpath`` is written to its own file in a fresh
    temporary directory under ``tmp_dir`` and processed separately; tool
    output (stdout and stderr) is appended to ``err_path``. GFF files of the
    successful runs are merged into ``<out_fpath>_genes.gff`` (with ``.gz``
    appended unless ``qconfig.no_gzip`` is set) and parsed back into ``Gene``
    objects.

    Returns:
        ``(out_gff_path, genes, unique_count, total, full_cnt, partial_cnt)``
        where ``full_cnt`` / ``partial_cnt`` hold, per threshold of
        ``gene_lengths``, how many full / partial genes satisfy
        ``end - start >= threshold``. When no contig was processed
        successfully all six values are ``None``.
    """
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')
            return return_code

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        # Names are truncated to the length limit before being handed to the tool.
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Do not leak the temporary directory on the failure path either.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        # Same arity as the success path so callers can unpack unconditionally.
        return None, None, None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)
    unique, total = set(), 0
    genes = []
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        contig_seq = contigs[contig]
        # GFF coordinates are used as 1-based inclusive, hence start - 1.
        gene_seq = contig_seq[start - 1:end]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        # A gene touching either end of its contig is counted as partial.
        gene.is_full = gene.start > 1 and gene.end < len(contig_seq)
        genes.append(gene)

    full_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if gene.is_full]) for threshold in gene_lengths]
    partial_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if not gene.is_full]) for threshold in gene_lengths]
    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
Esempio n. 12
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run the ``glimmerhmm`` executable on every contig and collect the genes.

    Each record of ``fasta_fpath`` is written to its own file in a fresh
    temporary directory under ``tmp_dir`` and processed separately; tool
    output (stdout and stderr) is appended to ``err_path``. GFF files of the
    successful runs are merged into ``<out_fpath>_genes.gff`` (with ``.gz``
    appended unless ``qconfig.no_gzip`` is set) and parsed back into ``Gene``
    objects.

    Returns:
        ``(out_gff_path, genes, unique_count, total, full_cnt, partial_cnt)``
        where ``full_cnt`` / ``partial_cnt`` hold, per threshold of
        ``gene_lengths``, how many full / partial genes satisfy
        ``end - start >= threshold``. When no contig was processed
        successfully all six values are ``None``.
    """
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')
            return return_code

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        # Names are truncated to the length limit before being handed to the tool.
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Do not leak the temporary directory on the failure path either.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        # Same arity as the success path so callers can unpack unconditionally.
        return None, None, None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)
    unique, total = set(), 0
    genes = []
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        contig_seq = contigs[contig]
        # GFF coordinates are used as 1-based inclusive, hence start - 1.
        gene_seq = contig_seq[start - 1:end]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        # A gene touching either end of its contig is counted as partial.
        gene.is_full = gene.start > 1 and gene.end < len(contig_seq)
        genes.append(gene)

    full_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if gene.is_full]) for threshold in gene_lengths]
    partial_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if not gene.is_full]) for threshold in gene_lengths]
    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
Esempio n. 13
0
def GC_content(contigs_fpath, skip=False):
    """Compute the overall GC percentage and two GC histograms of an assembly.

    Returns a triple ``(total_GC, (x, y), (contigs_x, contigs_y))``:

    * ``total_GC`` - GC percent over all non-``N`` bases, or ``None`` when
      ``skip`` is set or the file has no non-``N`` bases;
    * ``(x, y)`` - histogram over non-overlapping 100 bp windows: ``x`` holds
      the GC % of each bin (multiples of ``qconfig.GC_bin_size``), ``y`` the
      number of windows per bin;
    * ``(contigs_x, contigs_y)`` - the same per whole contig, binned by
      ``qconfig.GC_contig_bin_size``.

    With ``skip`` set the file is not read and both histograms are all zeros.
    """
    total_GC_amount = 0
    total_contig_length = 0
    GC_contigs_bin_num = int(100 / qconfig.GC_contig_bin_size) + 1
    # list of X-coordinates, i.e. GC %
    GC_contigs_distribution_x = [i * qconfig.GC_contig_bin_size for i in range(0, GC_contigs_bin_num)]
    # list of Y-coordinates, i.e. # contigs with GC % = x
    GC_contigs_distribution_y = [0] * GC_contigs_bin_num

    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    # list of X-coordinates, i.e. GC %
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(0, GC_bin_num)]
    # list of Y-coordinates, i.e. # windows with GC % = x
    GC_distribution_y = [0] * GC_bin_num

    total_GC = None
    if skip:
        return total_GC, (GC_distribution_x, GC_distribution_y), \
            (GC_contigs_distribution_x, GC_contigs_distribution_y)

    n = 100  # blocks of length 100
    for name, seq_full in fastaparser.read_fasta(contigs_fpath):  # in tuples: (name, seq)
        contig_ACGT_len = len(seq_full) - seq_full.count("N")
        if not contig_ACGT_len:
            continue
        contig_GC_len = seq_full.count("G") + seq_full.count("C")
        contig_GC_percent = 100.0 * contig_GC_len / contig_ACGT_len
        GC_contigs_distribution_y[int(contig_GC_percent // qconfig.GC_contig_bin_size)] += 1

        # non-overlapping windows
        for window_start in range(0, len(seq_full), n):
            seq = seq_full[window_start:window_start + n]
            GC_percent = get_GC_percent(seq, n)
            # NOTE(review): this also skips windows whose value is exactly 0;
            # confirm against get_GC_percent whether only "no value" is meant.
            if not GC_percent:
                continue
            # Index by bin number so the slot lines up with GC_distribution_x.
            # Multiplying back by the bin size (as done previously) yields the
            # GC percent itself, which is only a valid index for a bin size of 1.
            GC_distribution_y[int(GC_percent / qconfig.GC_bin_size)] += 1
        total_GC_amount += contig_GC_len
        total_contig_length += contig_ACGT_len

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y), \
        (GC_contigs_distribution_x, GC_contigs_distribution_y)
Esempio n. 14
0
def save_icarus_GC(ref_fpath, gc_fpath):
    """Write per-window GC values of the reference, grouped by chromosome index.

    For every reference sequence a header line ``#<name> <index>`` is
    written, followed by one ``<index> <GC value>`` line per non-overlapping
    window. The window size is ``qconfig.GC_window_size_large`` when
    ``qconfig.large_genome`` is set and ``qconfig.GC_window_size`` otherwise.
    ``<index>`` is the 0-based position of the sequence in ``ref_fpath``.
    """
    n = qconfig.GC_window_size_large if qconfig.large_genome else qconfig.GC_window_size  # non-overlapping windows
    with open(gc_fpath, 'w') as out_f:
        # enumerate() advances the index per chromosome; a counter that is
        # never incremented would tag every chromosome with 0.
        for chr_index, (name, seq_full) in enumerate(fastaparser.read_fasta(ref_fpath)):
            out_f.write('#' + name + ' ' + str(chr_index) + '\n')
            for i in range(0, len(seq_full), n):
                seq = seq_full[i:i + n]
                GC_percent = get_GC_percent(seq, n)
                out_f.write(str(chr_index) + ' ' + str(GC_percent) + '\n')
Esempio n. 15
0
def save_icarus_GC(ref_fpath, gc_fpath):
    """Write per-window GC values of the reference, grouped by chromosome index.

    For every reference sequence a header line ``#<name> <index>`` is
    written, followed by one ``<index> <GC value>`` line per non-overlapping
    window. The window size is ``qconfig.GC_window_size_large`` when
    ``qconfig.large_genome`` is set and ``qconfig.GC_window_size`` otherwise.
    ``<index>`` is the 0-based position of the sequence in ``ref_fpath``.
    """
    n = qconfig.GC_window_size_large if qconfig.large_genome else qconfig.GC_window_size  # non-overlapping windows
    with open(gc_fpath, 'w') as out_f:
        # enumerate() advances the index per chromosome; a counter that is
        # never incremented would tag every chromosome with 0.
        for chr_index, (name, seq_full) in enumerate(fastaparser.read_fasta(ref_fpath)):
            out_f.write('#' + name + ' ' + str(chr_index) + '\n')
            for i in range(0, len(seq_full), n):
                seq = seq_full[i:i + n]
                GC_percent = get_GC_percent(seq, n)
                out_f.write(str(chr_index) + ' ' + str(GC_percent) + '\n')
Esempio n. 16
0
def remove_repeat_regions(ref_fpath, repeats_fpath, uncovered_fpath):
    """Compute, per reference sequence, the regions left after removing repeats and uncovered parts.

    Works in two passes:

    1. For every sequence of ``ref_fpath`` build ``unique_regions``: the
       ``[start, end]`` stretches lying between the intervals listed for that
       sequence in ``repeats_fpath`` (the whole sequence when it has none).
    2. Walk the intervals listed in ``uncovered_fpath`` over those stretches
       and collect what remains into ``unique_covered_regions``.

    Both BED files are read with ``parse_bed``; the result is used here as a
    mapping from sequence name to an iterable of ``(start, end)`` pairs
    (presumably sorted by position - TODO confirm against ``parse_bed``).

    Returns:
        ``defaultdict(list)`` mapping sequence name to a list of
        ``[start, end]`` regions.
    """
    repeats_regions = parse_bed(repeats_fpath)
    uncovered_regions = parse_bed(uncovered_fpath)
    unique_regions = defaultdict(list)
    # Pass 1: complement of the repeat intervals within each sequence.
    for name, seq in fastaparser.read_fasta(ref_fpath):
        if name in repeats_regions:
            cur_contig_start = 0
            for start, end in repeats_regions[name]:
                if start > cur_contig_start:
                    unique_regions[name].append([cur_contig_start, start])
                else:
                    # The repeat starts at or before the current position:
                    # two zero-length placeholder regions are recorded instead.
                    unique_regions[name].append([cur_contig_start, cur_contig_start])
                    unique_regions[name].append([start, start])
                cur_contig_start = end + 1
            # Tail of the sequence after the last repeat.
            if cur_contig_start < len(seq):
                unique_regions[name].append([cur_contig_start, len(seq)])
        else:
            unique_regions[name].append([0, len(seq)])
    unique_covered_regions = defaultdict(list)
    # Pass 2: cut the uncovered intervals out of the unique regions.
    for name, regions in unique_regions.items():
        if name in uncovered_regions:
            # (cur_contig_start, cur_contig_end) is the unique region currently
            # being examined; cur_contig_idx is its position in the list.
            cur_contig_idx = 0
            cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
            for uncov_start, uncov_end in uncovered_regions[name]:
                # Regions ending before this uncovered interval are kept as they are.
                while cur_contig_end < uncov_start:
                    unique_covered_regions[name].append([cur_contig_start, cur_contig_end])
                    cur_contig_idx += 1
                    # NOTE(review): this break leaves only the inner while loop;
                    # the checks below still run with the previous region's bounds.
                    if cur_contig_idx >= len(unique_regions[name]):
                        break
                    cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
                # Uncovered interval lies entirely before the current region.
                if uncov_end < cur_contig_start:
                    continue
                # Uncovered interval swallows the current region: drop the region.
                if uncov_start <= cur_contig_start and uncov_end >= cur_contig_end:
                    cur_contig_idx += 1
                    if cur_contig_idx >= len(unique_regions[name]):
                        break
                    cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
                # Partial overlap: keep the part before the uncovered interval and
                # either continue after it or move on to the next region.
                elif cur_contig_start <= uncov_start <= cur_contig_end or cur_contig_start <= uncov_end <= cur_contig_end:
                    if uncov_start > cur_contig_start:
                        unique_covered_regions[name].append([cur_contig_start, uncov_start])
                    if uncov_end < cur_contig_end:
                        cur_contig_start = uncov_end
                    else:
                        cur_contig_idx += 1
                        if cur_contig_idx >= len(unique_regions[name]):
                            break
                        cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
                else:
                    unique_covered_regions[name].append([cur_contig_start, cur_contig_end])
            # Everything from the current index onwards is appended with the
            # bounds stored in unique_regions.
            # NOTE(review): a region whose start was advanced above is appended
            # here with its original start - confirm this is intended.
            for contig in unique_regions[name][cur_contig_idx:]:
                unique_covered_regions[name].append(contig)
        else:
            unique_covered_regions[name] = unique_regions[name]
    return unique_covered_regions
Esempio n. 17
0
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Split the scaffolds of ``contigs_fpath`` at long runs of ``N``.

    A run of at least ``qconfig.Ns_break_threshold`` consecutive ``N``
    characters ends the current contig. Pieces are named
    ``<first word of scaffold name>_<ordinal>`` with ordinals starting at 1.
    Progress messages are appended to ``logs``. The broken assembly is
    written only when the number of produced contigs differs from the number
    of scaffolds read; an input without records therefore counts as
    "nothing was broken".

    Returns:
        ``(broken_scaffolds_fpath, logs)`` when a file was written, otherwise
        ``(None, logs)``.
    """
    logs.append('  ' + index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
    broken_scaffolds_fasta = []
    contigs_counter = 0

    # Counted explicitly: an enumerate() index cannot tell an empty input
    # apart from a single scaffold.
    scaffolds_counter = 0
    for name, seq in fastaparser.read_fasta(contigs_fpath):
        scaffolds_counter += 1
        total_contigs_for_the_scaf = 1
        cur_contig_start = 0
        search_from = 0
        while search_from < len(seq):
            start = seq.find("N", search_from)
            if start == -1:
                break
            # Extend the gap over the whole run of consecutive Ns.
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1
            # seq[end] is known not to be 'N', so the next search may skip it.
            search_from = end + 1

            if (end - start) >= qconfig.Ns_break_threshold:
                broken_scaffolds_fasta.append(
                    (name.split()[0] + "_" +
                     str(total_contigs_for_the_scaf),
                     seq[cur_contig_start:start]))
                total_contigs_for_the_scaf += 1
                cur_contig_start = end

        # Whatever follows the last break (or the whole scaffold) is the final piece.
        broken_scaffolds_fasta.append(
            (name.split()[0] + "_" +
             str(total_contigs_for_the_scaf),
             seq[cur_contig_start:]))

        contigs_counter += total_contigs_for_the_scaf

    if scaffolds_counter != contigs_counter:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (scaffolds_counter,
                     label,
                     contigs_counter,
                     label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append("  " + index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
Esempio n. 18
0
def fill_gaps_single(ref_fpath, assembly_fpath, assembly_covered_regions, uncovered_fpath):
    """Write reference fragments covered according to both the assembly and the reads.

    For every reference sequence the regions returned by ``find_overlaps``
    (assembly-covered regions against the regions parsed from
    ``uncovered_fpath``, with ``overlap=50``) are cut out of the sequence and
    stored as ``<first word of name>_<ordinal>``.

    Returns:
        Path of the written FASTA file (``assembly_fpath`` with
        ``single_polished_suffix`` added).
    """
    reads_covered = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_assembly_fpath = add_suffix(assembly_fpath, single_polished_suffix)
    polished_entries = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        shared_regions = find_overlaps(assembly_covered_regions[name], reads_covered[name], overlap=50)
        for ordinal, (start, end) in enumerate(shared_regions, 1):
            contig_name = name.split()[0] + "_" + str(ordinal)
            polished_entries.append((contig_name, seq[start: end]))
    fastaparser.write_fasta(final_assembly_fpath, polished_entries)
    return final_assembly_fpath
Esempio n. 19
0
def preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath):
    """
       Cuts the uncovered regions out of the reference and passes every remaining
       piece through split_by_ns(), which collects the resulting entries.

       The entries are written to a FASTA file named like the reference inside tmp_dir;
       the path of that file is returned.
    """
    gaps_by_chrom = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=False)
    pieces = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        if name not in gaps_by_chrom:
            # nothing is uncovered on this sequence: split it as a whole
            split_by_ns(seq, name, pieces)
            continue

        piece_start = 0
        pieces_so_far = 0
        for gap_start, gap_end in gaps_by_chrom[name]:
            # the running counter returned by split_by_ns() is fed back into the next call
            pieces_so_far = split_by_ns(seq[piece_start: gap_start], name, pieces,
                                        total_contigs=pieces_so_far)
            piece_start = gap_end
        # the part after the last uncovered region
        split_by_ns(seq[piece_start:], name, pieces, total_contigs=pieces_so_far)

    processed_ref_fpath = join(tmp_dir, basename(ref_fpath))
    fastaparser.write_fasta(processed_ref_fpath, pieces)
    return processed_ref_fpath
Esempio n. 20
0
def remove_repeat_regions(ref_fpath, repeats_fpath, insert_size, tmp_dir,
                          uncovered_fpath, err_fpath):
    """
       Returns, per reference sequence, the list of [start, end] regions that remain
       after removing the regions listed in the file produced by merge_bed()
       from repeats_fpath and uncovered_fpath.

       Sequences without any region to remove get the single region [0, len(seq)].
    """
    merged_fpath = merge_bed(repeats_fpath, uncovered_fpath, insert_size,
                             tmp_dir, err_fpath)
    regions_to_remove = parse_uncovered_fpath(merged_fpath,
                                              ref_fpath,
                                              return_covered_regions=False)
    unique_regions = defaultdict(list)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        seq_len = len(seq)
        if name not in regions_to_remove:
            unique_regions[name].append([0, seq_len])
            continue

        kept_from = 0
        for start, end in regions_to_remove[name]:
            if kept_from < start:
                unique_regions[name].append([kept_from, start])
            # NOTE(review): the next kept region starts one position past `end` -- confirm
            # this matches the coordinate convention of the merged file
            kept_from = end + 1
        if kept_from < seq_len:
            unique_regions[name].append([kept_from, seq_len])
    return unique_regions
Esempio n. 21
0
def GC_content(contigs_fpath, skip=False):
    """
       Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)

       The return value is a 3-tuple:
         total_GC -- GC % over all non-N bases of the assembly (None if skip is set or
                     the assembly has no non-N bases);
         (GC_distribution_x, GC_distribution_y) -- histogram over non-overlapping
                     100 bp windows, binned by qconfig.GC_bin_size;
         (GC_contigs_distribution_x, GC_contigs_distribution_y) -- histogram over whole
                     contigs, binned by qconfig.GC_contig_bin_size.

       If skip is True, the FASTA file is not read and both histograms are all zeros.
    """
    total_GC_amount = 0
    total_contig_length = 0
    GC_contigs_bin_num = int(100 / qconfig.GC_contig_bin_size) + 1
    GC_contigs_distribution_x = [i * qconfig.GC_contig_bin_size for i in range(0, GC_contigs_bin_num)] # list of X-coordinates, i.e. GC %
    GC_contigs_distribution_y = [0] * GC_contigs_bin_num # list of Y-coordinates, i.e. # contigs with GC % = x

    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(0, GC_bin_num)] # list of X-coordinates, i.e. GC %
    GC_distribution_y = [0] * GC_bin_num # list of Y-coordinates, i.e. # windows with GC % = x
    total_GC = None
    if skip:
        return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)

    window_len = 100 # blocks of length 100
    for name, seq_full in fastaparser.read_fasta(contigs_fpath): # in tuples: (name, seq)
        contig_ACGT_len = len(seq_full) - seq_full.count("N")
        if not contig_ACGT_len:
            # contig consists of Ns only: no GC % can be computed for it
            continue
        contig_GC_len = seq_full.count("G") + seq_full.count("C")
        contig_GC_percent = 100.0 * contig_GC_len / contig_ACGT_len
        GC_contigs_distribution_y[int(contig_GC_percent // qconfig.GC_contig_bin_size)] += 1

        # non-overlapping windows
        for window_start in range(0, len(seq_full), window_len):
            window = seq_full[window_start:window_start + window_len]
            GC_percent = get_GC_percent(window, window_len)
            # NOTE(review): this also skips windows whose GC % is exactly 0 -- confirm
            # against get_GC_percent() whether only a "no value" result should be skipped
            if not GC_percent:
                continue
            # GC_distribution_y[i] belongs to X value i * GC_bin_size, so the position is the
            # bin number itself (multiplying it back by the bin size would select a wrong or
            # out-of-range slot whenever the bin size is not 1)
            GC_distribution_y[int(GC_percent / qconfig.GC_bin_size)] += 1
        total_GC_amount += contig_GC_len
        total_contig_length += contig_ACGT_len

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)
Esempio n. 22
0
def parse_uncovered_fpath(uncovered_fpath, fasta_fpath, return_covered_regions=True):
    """
       Parses a tab-separated file whose first three columns are (chrom, start, end)
       of uncovered regions.

       uncovered_fpath -- path to the file; None, an empty path or a non-existing file
                          mean "no uncovered regions".
       fasta_fpath     -- FASTA file with the sequences; read only when
                          return_covered_regions is True (sequence lengths are needed).
       return_covered_regions -- if True, return the complement of the listed regions
                          (the covered regions); otherwise return the listed regions.

       Returns a defaultdict(list): chrom -> list of (start, end) tuples.
       Regions of one chromosome are expected to be listed in increasing order.
    """
    regions = defaultdict(list)
    prev_start = defaultdict(int)  # chrom -> end of the last uncovered region seen so far
    if uncovered_fpath and exists(uncovered_fpath):
        with open(uncovered_fpath) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                # only the first three columns are used; extra columns are ignored
                chrom, start, end = line.rstrip('\r\n').split('\t')[:3]
                start, end = int(start), int(end)
                if return_covered_regions:
                    if prev_start[chrom] != start:
                        regions[chrom].append((prev_start[chrom], start))
                    prev_start[chrom] = end
                else:
                    regions[chrom].append((start, end))
    if return_covered_regions:
        for name, seq in fastaparser.read_fasta(fasta_fpath):
            # membership is tested on prev_start, not on regions: a chromosome whose
            # uncovered regions start at position 0 has no entry in regions yet, but
            # must not be reported as entirely covered
            if name in prev_start:
                if prev_start[name] != len(seq):
                    regions[name].append((prev_start[name], len(seq)))
            else:
                regions[name].append((0, len(seq)))
    return regions
Esempio n. 23
0
def parse_uncovered_fpath(uncovered_fpath,
                          fasta_fpath,
                          return_covered_regions=True):
    """
       Parses a tab-separated file whose first three columns are (chrom, start, end)
       of uncovered regions.

       uncovered_fpath -- path to the file; None, an empty path or a non-existing file
                          mean "no uncovered regions".
       fasta_fpath     -- FASTA file with the sequences; read only when
                          return_covered_regions is True (sequence lengths are needed).
       return_covered_regions -- if True, return the complement of the listed regions
                          (the covered regions); otherwise return the listed regions.

       Returns a defaultdict(list): chrom -> list of (start, end) tuples.
       Regions of one chromosome are expected to be listed in increasing order.
    """
    regions = defaultdict(list)
    prev_start = defaultdict(int)  # chrom -> end of the last uncovered region seen so far
    if uncovered_fpath and exists(uncovered_fpath):
        with open(uncovered_fpath) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                # only the first three columns are used; extra columns are ignored
                chrom, start, end = line.rstrip('\r\n').split('\t')[:3]
                start, end = int(start), int(end)
                if return_covered_regions:
                    if prev_start[chrom] != start:
                        regions[chrom].append((prev_start[chrom], start))
                    prev_start[chrom] = end
                else:
                    regions[chrom].append((start, end))
    if return_covered_regions:
        for name, seq in fastaparser.read_fasta(fasta_fpath):
            # membership is tested on prev_start, not on regions: a chromosome whose
            # uncovered regions start at position 0 has no entry in regions yet, but
            # must not be reported as entirely covered
            if name in prev_start:
                if prev_start[name] != len(seq):
                    regions[name].append((prev_start[name], len(seq)))
            else:
                regions[name].append((0, len(seq)))
    return regions
Esempio n. 24
0
def parse_contigs_fpath(contigs_fpath):
    """
       Returns a list with one Contig(name, size) per FASTA entry of contigs_fpath,
       in file order; size is the length of the entry's sequence.
    """
    return [Contig(name=entry_name, size=len(entry_seq))
            for entry_name, entry_seq in fastaparser.read_fasta(contigs_fpath)]
Esempio n. 25
0
# Script fragment: cuts the region [pos1, pos2] (1-based positions given on the
# command line) plus margins out of the first sequence of a reference FASTA file.
# NOTE(review): the fragment ends right after the header of the final `while` loop,
# so the loop body (and the definition of KMER_SIZE) is not visible here.

# number of extra bases written on each side of the requested region
REF_MARGINS = 300
# output file for the extracted reference region
REF_FNAME = "ref.fa"

if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "reference pos1 pos2"
    sys.exit(0)

pos1 = int(sys.argv[2])
pos2 = int(sys.argv[3])

# make sure pos1 <= pos2
if pos1 > pos2:
    pos = pos1
    pos1 = pos2
    pos2 = pos

# only the sequence of the first FASTA entry is used
reference = fastaparser.read_fasta(
    sys.argv[1])[0][1]  # Returns list of FASTA entries (in tuples: name, seq)
# clamp the right border to the reference length
if len(reference) < pos2:
    pos2 = len(reference)

# write the region extended by REF_MARGINS on both sides (clamped to the sequence borders)
ref_file = open(REF_FNAME, 'w')
ref_file.write(">reference\n")
ref_file.write(reference[max(0, pos1 - 1 -
                             REF_MARGINS):min(len(reference), pos2 +
                                              REF_MARGINS)] + "\n")
ref_file.close()

# the requested region itself, without margins
misassembled_site = reference[pos1 - 1:pos2]
kmers = set()

# slide over the region while a window of KMER_SIZE bases still fits before pos2
i = pos1 - 1
while i + KMER_SIZE <= pos2:
Esempio n. 26
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """
       Simulates the "Optimal Assembly" for a reference and writes it as FASTA.

       ref_fpath          -- reference FASTA used for all computations
       original_ref_fpath -- original reference path; its directory is checked for an
                             already prepared optimal assembly that can be reused
       output_dirpath     -- directory for the result, the log and the temporary files

       Returns the path of the optimal assembly (newly created or reused), or None if
       the platform is 'linux_32', the Red binary is unavailable, or the computation
       of unique covered regions failed.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    # uncovered_fpath stays None unless reads or a reference SAM/BAM are configured
    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)
    # fall back to the default insert size when it is 'auto' or not set
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # name of the result file: <ref>.<optimal assembly basename>.is<insert size>.fasta,
    # with an extra suffix when long reads or mate pairs are used
    # NOTE(review): '%d' requires insert_size to be numeric here -- confirm against qconfig
    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename,
                                     long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # location next to the original reference where a prepared result may already exist
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath),
        prepared_optimal_assembly_basename)

    # reuse an existing result; the one in output_dirpath takes precedence
    if os.path.isfile(result_fpath) or os.path.isfile(
            ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            '  Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    # obtain the external 'Red' binary; without it nothing can be computed
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red',
                                          red_dirpath,
                                          'red',
                                          platform_specific=True,
                                          is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning('  Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    # start from an empty temporary directory
    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size,
        uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Optimal Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # long reads take precedence over mate pairs; PacBio over Nanopore
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        # uncovered regions and the read length are only used for mate pairs
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom],
                                               region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          ref_coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])
    else:
        # no joining reads: output every unique covered region of at least MIN_CONTIG_LEN
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  ' + 'Theoretically optimal Assembly saved to ' +
                result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference ('
        + original_ref_fpath + ')')

    # temporary files are kept only in debug mode
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
Esempio n. 27
0
def analyze_contigs(ca_output,
                    contigs_fpath,
                    unaligned_fpath,
                    aligns,
                    ref_features,
                    ref_lens,
                    cyclic=None):
    """
       Classifies every contig of contigs_fpath by its alignments and collects statistics.

       ca_output       -- object providing the open output streams stdout_f, icarus_out_f
                          and coords_filtered_f (coords_filtered_f is closed at the end)
       contigs_fpath   -- FASTA file with the contigs
       unaligned_fpath -- file to which the names of contigs without alignments are written
       aligns          -- mapping: contig name -> list of its alignments; alignments are
                          indexed here as [5] (used as length), [6] (used as identity)
                          and [7] (used as the key of ref_aligns)
                          -- NOTE(review): presumably [7] is the reference name, confirm
       ref_features, ref_lens, cyclic -- passed through to the helper functions

       Returns a tuple (result, ref_aligns, total_indels_info, aligned_lengths,
       misassembled_contigs), where result is a dict with the collected counters.

       The code uses Python 2 syntax (print >>, xrange, itervalues).
    """
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    partially_unaligned_with_misassembly = 0
    partially_unaligned_with_significant_parts = 0
    misassembly_internal_overlap = 0
    contigs_with_istranslocations = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()

    region_struct_variations = find_all_sv(qconfig.bed)

    # square table of counters: reference label -> reference label -> 0
    references_misassemblies = {}
    for ref in ref_labels_by_chromosomes.values():
        references_misassemblies[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    unaligned_file = open(unaligned_fpath, 'w')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len)
        contig_type = 'unaligned'

        #Check if this contig aligned to the reference
        if contig in aligns:
            # NOTE(review): sub_seq and ns_pos are computed here but not used
            # anywhere else in this function
            for align in aligns[contig]:
                #sub_seq = seq[align.start(): align.end()]
                sub_seq = seq[_start(align):_end(align)]
                if 'N' in sub_seq:
                    ns_pos = [
                        pos for pos in xrange(_start(align), _end(align))
                        if seq[pos] == 'N'
                    ]
#                    ns_pos = [pos for pos in xrange(align.start(), align.end()) if seq[pos] == 'N']
            contig_type = 'correct'
            #Pull all aligns for this contig
            num_aligns = len(aligns[contig])

            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(aligns[contig],
                                   key=lambda x: (score_single_align(x), x[5]),
                                   reverse=True)
            top_len = sorted_aligns[0][5]
            top_id = sorted_aligns[0][6]
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            print >> ca_output.stdout_f, 'Top Length: %d  Top ID: %.2f (Score: %.1f)' % (
                top_len, top_id, top_score)

            #Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments while length and identity are identical
                #while sorted_aligns and top_len == sorted_aligns[0][5] and top_id == sorted_aligns[0][6]:
                while sorted_aligns and (score_single_align(
                        sorted_aligns[0]) >=
                                         qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str(
                        qconfig.ambiguity_score)
                    for align in sorted_aligns:
                        print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str(
                        top_aligns[0])
                    #                    print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        top_aligns[0])
                    ref_aligns.setdefault(top_aligns[0][7],
                                          []).append(top_aligns[0])
                    print >> ca_output.coords_filtered_f, str(top_aligns[0])
                    aligned_lengths.append(top_aligns[0][5])
                else:
                    #There is more than one top align
                    print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len(
                        top_aligns)

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):'
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):'
                        print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                            top_aligns[0])
                        #                        print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                        print >> ca_output.icarus_out_f, icarus_report_str(
                            top_aligns[0])
                        ref_aligns.setdefault(top_aligns[0][7],
                                              []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0][5])
                        print >> ca_output.coords_filtered_f, str(
                            top_aligns[0])
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "all":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):'
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        while len(top_aligns):
                            print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                                top_aligns[0])
                            #                            print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str(ambiguity=True)
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                top_aligns[0], ambiguity=True)
                            ref_aligns.setdefault(top_aligns[0][7],
                                                  []).append(top_aligns[0])
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0][5])
                            ambiguous_contigs_extra_bases += top_aligns[0][5]
                            print >> ca_output.coords_filtered_f, str(
                                top_aligns[0]), "ambiguous"
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                    cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                used_indexes = range(
                    len(sorted_aligns)
                ) if too_much_best_sets else get_used_indexes(best_sets)
                if len(used_indexes) < len(sorted_aligns):
                    print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments'
                    for idx in set(range(len(sorted_aligns))) - used_indexes:
                        print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[
                            idx]

                if is_ambiguous:
                    print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]'
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (
                            ctg_len - the_best_set.uncovered)
                        print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):'
                        for idx in used_indexes:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                idx]
                        # NOTE(review): this `continue` skips the per-contig 'CONTIG' line
                        # written to icarus_out_f at the end of the loop body -- confirm intended
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").'
                        if len(the_best_set.indexes) < len(used_indexes):
                            print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:'
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                        idx]
                    elif qconfig.ambiguity_usage == "all":
                        print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):'
                        print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in details below, the rest are:'
                        for idx, cur_set in enumerate(best_sets[1:]):
                            print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)
                        if too_much_best_sets:
                            print >> ca_output.stdout_f, '\t\t\t\tetc...'
                        if len(the_best_set.indexes) < len(used_indexes):
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:'
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                    align)
                                ref_aligns.setdefault(align[7],
                                                      []).append(align)
                                ambiguous_contigs_extra_bases += align[5]
                                print >> ca_output.coords_filtered_f, str(
                                    align), "ambiguous"
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.icarus_out_f, icarus_report_str(
                                        align, is_best=False)
#                                    print >> ca_output.icarus_out_f, align.icarus_report_str(is_best=False)

                print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                             (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    print >> ca_output.coords_filtered_f, str(the_only_align)
                    aligned_lengths.append(the_only_align[5])

                    #                    begin, end = the_only_align.start(), the_only_align.end()
                    begin, end = _start(the_only_align), _end(the_only_align)
                    unaligned_bases = 0
                    # contig positions are treated as 1-based here: bases 1..begin-1 and
                    # end+1..ctg_len are the unaligned flanks
                    if (begin - 1) or (ctg_len - end):
                        partially_unaligned += 1
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        partially_unaligned_bases += unaligned_bases
                        print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % (
                            top_len, ctg_len)
                    print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                        the_only_align)
                    #                    print >> ca_output.icarus_out_f, the_only_align.icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        the_only_align)
                    if begin - 1:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % (
                            begin - 1, begin - 1)
                    if ctg_len - end:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % (
                            end + 1, ctg_len, ctg_len - end)
                    # check if both parts (aligned and unaligned) have significant length
                    if (unaligned_bases >= qconfig.significant_part_size) and (
                            ctg_len - unaligned_bases >=
                            qconfig.significant_part_size):
                        print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        partially_unaligned_with_significant_parts += 1
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, real_aligns, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align[7],
                                          []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns,
                                           key=lambda x: (_end(x), _start(x)))
                    #                    sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    #There is more than one alignment of this contig to the reference
                    print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns
                    aligned_bases_in_contig = ctg_len - the_best_set.uncovered

                    # less than umt of the contig is aligned: report it as partially
                    # unaligned and do not run the misassembly processing below
                    if aligned_bases_in_contig < umt * ctg_len:
                        print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                            'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig)
                        for align in sorted_aligns:
                            print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                align)
                            #                            print >> ca_output.icarus_out_f, align.icarus_report_str()
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                align)
                            print >> ca_output.coords_filtered_f, str(align)
                            aligned_lengths.append(align[5])
                            ref_aligns.setdefault(align[7], []).append(align)

                        partially_unaligned_with_misassembly += 1
                        partially_unaligned += 1
                        partially_unaligned_bases += ctg_len - aligned_bases_in_contig
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % (
                            ctg_len - aligned_bases_in_contig)
                        # check if both parts (aligned and unaligned) have significant length
                        if (aligned_bases_in_contig >=
                                qconfig.significant_part_size) and (
                                    ctg_len - aligned_bases_in_contig >=
                                    qconfig.significant_part_size):
                            print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                         '(of length >= %d)!' % (qconfig.significant_part_size)
                            partially_unaligned_with_significant_parts += 1
                            if qconfig.meta:
                                contigs_with_istranslocations += check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns,
                                    ca_output.stdout_f)
                        contig_type = 'misassembled'
                        # the per-contig summary is written here because the `continue`
                        # below skips the common summary at the end of the loop body
                        print >> ca_output.icarus_out_f, '\t'.join(
                            ['CONTIG', contig,
                             str(ctg_len), contig_type])
                        print >> ca_output.stdout_f
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \
                        process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, references_misassemblies,
                                                    region_struct_variations, misassemblies_matched_sv, ca_output,
                                                    is_ambiguous)
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                    if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size:
                        print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, sorted_aligns,
                                ca_output.stdout_f)
        else:
            #No aligns to this contig
            print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len
            print >> unaligned_file, contig

            #Increment unaligned contig count and bases
            unaligned += 1
            fully_unaligned_bases += ctg_len
            print >> ca_output.stdout_f, '\t\tUnaligned bases: %d  total: %d' % (
                ctg_len, fully_unaligned_bases)

        # common per-contig summary line for the contig viewer output
        print >> ca_output.icarus_out_f, '\t'.join(
            ['CONTIG', contig, str(ctg_len), contig_type])
        print >> ca_output.stdout_f

    ca_output.coords_filtered_f.close()
    unaligned_file.close()
    # total length of all contigs recorded as misassembled
    misassembled_bases = sum(misassembled_contigs.itervalues())

    result = {
        'region_misassemblies':
        region_misassemblies,
        'region_struct_variations':
        region_struct_variations.get_count()
        if region_struct_variations else None,
        'misassemblies_matched_sv':
        misassemblies_matched_sv,
        'misassembled_contigs':
        misassembled_contigs,
        'misassembled_bases':
        misassembled_bases,
        'misassembly_internal_overlap':
        misassembly_internal_overlap,
        'unaligned':
        unaligned,
        'partially_unaligned':
        partially_unaligned,
        'partially_unaligned_bases':
        partially_unaligned_bases,
        'fully_unaligned_bases':
        fully_unaligned_bases,
        'ambiguous_contigs':
        ambiguous_contigs,
        'ambiguous_contigs_extra_bases':
        ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len':
        ambiguous_contigs_len,
        'partially_unaligned_with_misassembly':
        partially_unaligned_with_misassembly,
        'partially_unaligned_with_significant_parts':
        partially_unaligned_with_significant_parts,
        'contigs_with_istranslocations':
        contigs_with_istranslocations,
        'istranslocations_by_refs':
        references_misassemblies
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs
Esempio n. 28
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the KMC-based unique k-mer analysis of assemblies against a reference.

    For every assembly the function reports k-mer completeness and, unless the
    assembly or the reference is considered too fragmented, the share of the
    assembly length whose long contigs carry markers of exactly one, of
    several, or of no reference sequences.  Results are stored through
    ``reporting`` and persisted with ``create_kmc_stats_file``.

    Assemblies for which ``check_kmc_successful_check`` succeeds and a usable
    ``<label>.stat`` file exists are served from that file and not recomputed.

    :param output_dir: directory holding ``*.stat`` files, ``kmc.log``,
        ``kmc.err`` and the temporary ``tmp`` sub-directory.
    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param logger: logger used for all progress and warning messages.
    :return: always ``None``; all results are delivered as side effects.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    # Reuse results of a previous successful run where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            with open(kmc_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # split() always returns at least one element, so an empty stats
            # file has to be detected by looking at the first line itself;
            # such an assembly is simply recomputed below.
            if not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate log files left over from a previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    # Completeness: share of the reference k-mers found in each assembly.
    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # k-mers shared by the reference and all remaining assemblies.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    # Reference files of 500 MiB or more are downsampled with the larger fraction value.
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000

    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)

    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    # One KMC database of shared k-mers per reference sequence.
    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        # These stay None when the scaffolding accuracy is not assessed.
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue

                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                # Contigs with too few shared k-mers are left unassigned.
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
Esempio n. 29
0
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Create (or reuse) the theoretical Upper Bound Assembly for a reference.

    The function bails out with ``None`` when the read-processing tools cannot
    be compiled, on the ``linux_32`` platform, when the Red binary is not
    available, or when ``get_unique_covered_regions`` yields no regions.
    If ``check_prepared_optimal_assembly`` finds a ready-made assembly, its
    path is returned immediately.  Otherwise the assembly is written to
    ``output_dirpath`` and the path of the new FASTA file is returned.

    :param ref_fpath: reference FASTA used for alignment and region extraction.
    :param original_ref_fpath: original reference path; its directory and base
        name define where a prepared assembly is looked up.
    :param output_dirpath: directory for the result, the log and a ``tmp`` dir.
    :return: path to the Upper Bound Assembly, or ``None`` on failure.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # --- prerequisites -----------------------------------------------------
    tools_compiled = reads_analyzer.compile_reads_analyzer_tools(logger)
    if not tools_compiled:
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    red_binary = download_external_tool(
        'Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not (red_binary and os.path.isfile(red_binary)):
        logger.warning(
            '  Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    # --- output naming -----------------------------------------------------
    insert_size = qconfig.optimal_assembly_insert_size
    if not insert_size or insert_size == 'auto':
        insert_size = qconfig.optimal_assembly_default_IS

    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads

    def _assembly_basename(stem, size):
        # '<stem>.<optimal basename>.is<size>.fasta' plus an optional "polished" suffix
        basename = '%s.%s.is%d.fasta' % (
            stem, qconfig.optimal_assembly_basename, size)
        if long_reads:
            return add_suffix(basename, long_reads_polished_suffix)
        if qconfig.mate_pairs:
            return add_suffix(basename, mp_polished_suffix)
        return basename

    ref_stem, _ = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_fpath = os.path.join(
        output_dirpath, _assembly_basename(ref_stem, insert_size))

    original_stem, _ = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_basename = _assembly_basename(original_stem, insert_size)
    prepared_fpath = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_basename)

    ready_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, prepared_fpath)
    if ready_fpath:
        return ready_fpath

    # --- read alignment (only when reads or alignments were supplied) -------
    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath),
                              qconfig.reads_stats_dirname)
    reads_given = (qconfig.reads_fpaths or qconfig.reference_sam
                   or qconfig.reference_bam)
    if reads_given:
        _, _, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath,
            reads_analyzer_dir,
            using_reads='all',
            calculate_coverage=True)

    # The configured insert size is re-read here; if it now differs from the
    # one used for naming, rename the outputs and repeat the reuse check.
    configured_is = qconfig.optimal_assembly_insert_size
    if configured_is != 'auto' and configured_is != insert_size:
        old_tag = 'is' + str(insert_size)
        new_tag = 'is' + str(configured_is)
        result_fpath = result_fpath.replace(old_tag, new_tag)
        prepared_basename = prepared_basename.replace(old_tag, new_tag)
        insert_size = configured_is
        prepared_fpath = os.path.join(
            os.path.dirname(original_ref_fpath), prepared_basename)
        ready_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, prepared_fpath)
        if ready_fpath:
            return ready_fpath

    # --- region detection --------------------------------------------------
    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, red_binary, insert_size,
        uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(
            '  Failed to create Upper Bound Assembly, see log for details: ' +
            log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    # --- assembly construction ---------------------------------------------
    if long_reads:
        join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
    elif qconfig.mate_pairs:
        join_reads = 'mp'
    else:
        join_reads = None

    if join_reads is None:
        # No joining reads: every sufficiently long unique region is a contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                region_start, region_end = region[0], region[1]
                if region_end - region_start < MIN_CONTIG_LEN:
                    continue
                result_fasta.append(
                    ('_'.join([chrom, str(idx)]), seq[region_start:region_end]))
    else:
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath,
                              bam_fpath, tmp_dir, log_fpath, join_reads)
        if join_reads == 'mp':
            uncovered_regions = parse_bed(uncovered_fpath)
            mp_len = calculate_read_len(sam_fpath)
        else:
            uncovered_regions = defaultdict(list)
            mp_len = None
        for chrom, seq in reference:
            chrom_regions = unique_covered_regions[chrom]
            pairing = get_regions_pairing(chrom_regions, joiners[chrom],
                                          mp_len)
            coords_to_output = scaffolding(chrom_regions, pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq),
                                          coords_to_output,
                                          repeats_regions[chrom],
                                          uncovered_regions[chrom])

    # --- output and user hints ---------------------------------------------
    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info('  Theoretical Upper Bound Assembly is saved to ' +
                result_fpath)
    reuse_notice = ''.join([
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n',
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). ',
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n',
        '\t\tOR\n',
        '\tYou can copy ', result_fpath, ' to ', prepared_fpath, '. ',
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (',
        original_ref_fpath, ') and ',
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ',
        str(insert_size), '), ',
        'QUAST will reuse this Upper Bound Assembly.\n',
    ])
    logger.notice(reuse_notice)

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
Esempio n. 30
0
        " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)"
    )
    sys.exit()

# Scaffold breaking is enabled only when the script receives three
# command-line arguments (input fasta, threshold, output fasta).
BREAK_SCAFFOLDS = False
if len(sys.argv) == 4:
    BREAK_SCAFFOLDS = True

# Minimal length of an N-run at which a scaffold is split; stays None when
# breaking is disabled.
N_NUMBER = None
counter = 0  # not used in the visible part of the script
if BREAK_SCAFFOLDS:
    N_NUMBER = int(sys.argv[2])

sizes_of_Ns_regions = dict()  # not filled in the visible part of the script
new_fasta = []  # collected (name, sequence) pairs of the split contigs
for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])):
    i = 0  # position from which the next N is searched
    cur_contig_number = 1  # 1-based suffix appended to the next emitted contig name
    cur_contig_start = 0  # start of the not-yet-emitted part of the sequence
    while (i < len(seq)) and (seq.find("N", i) != -1):
        # [start, end) is the maximal run of 'N' characters beginning at `start`
        start = seq.find("N", i)
        end = start + 1
        while (end != len(seq)) and (seq[end] == 'N'):
            end += 1

        # seq[end] is either past the sequence or not an 'N', so the search
        # can resume one position further
        i = end + 1
        if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER:
            # emit the part before the N-run under "<first word of name>_<number>"
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                              seq[cur_contig_start:start]))
            cur_contig_number += 1
            cur_contig_start = end
Esempio n. 31
0
def align_and_analyze(is_cyclic,
                      index,
                      contigs_fpath,
                      output_dirpath,
                      ref_fpath,
                      reference_chromosomes,
                      ns_by_chromosomes,
                      old_contigs_fpath,
                      bed_fpath,
                      threads=1):
    """Align one assembly to the reference and analyze the resulting alignments.

    :param is_cyclic: forwarded to ``analyze_contigs``.
    :param index: assembly index, used only for log message prefixes and
        forwarded to ``align_contigs``.
    :param contigs_fpath: path to the assembly FASTA file.
    :param output_dirpath: directory for the per-assembly report files.
    :param ref_fpath: path to the reference FASTA file.
    :param reference_chromosomes: mapping of reference sequence name to length.
    :param ns_by_chromosomes: forwarded to ``analyze_coverage``.
    :param old_contigs_fpath: forwarded to ``align_contigs``.
    :param bed_fpath: not used by this function body.
    :param threads: number of threads forwarded to ``align_contigs``.
    :return: tuple ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``.  When the aligner does not finish with
        ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_prefix = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_prefix + '.stdout')
        log_err_fpath = join(output_dirpath, report_prefix + '.stderr')
        icarus_out_fpath = join(
            output_dirpath,
            qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath,
                                 report_prefix + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath,
                                    report_prefix + '.unaligned.info')
    else:
        # In space-efficient mode all auxiliary output is discarded.
        log_out_fpath = os.devnull
        log_err_fpath = os.devnull
        icarus_out_fpath = os.devnull
        misassembly_fpath = os.devnull
        unaligned_info_fpath = os.devnull

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
        'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' +
                    log_out_fpath + ' and ' + os.path.basename(log_err_fpath) +
                    '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(
        out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath,
                           contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(
                    '  ' + qutils.index_to_str(index) +
                    'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' +
                    contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Alignment failed for ' + '\'' + assembly_label +
                            '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' +
                    contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Nothing aligned for ' + '\'' + assembly_label +
                            '\'.')
        # No analysis will run, so release the report files opened above.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any): every reference sequence is one region
    # covering it entirely.
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # The handles bundled here are released by close_handlers() below.
    ca_output = CAOutput(stdout_f=log_out_f,
                         misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {
        'SNPs': total_indels_info.mismatches,
        'indels_list': total_indels_info.indels_list,
        'total_aligned_bases': total_aligned_bases
    }
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq)
                 for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath,
                 qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath,
            qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Contig names of the form "..._length_<len>_cov_<cov>" carry the
        # contig length and coverage; compile the pattern once for all contigs.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV line per reference sequence: its name followed
                    # by the names of all contigs aligned to it.
                    alignment_tsv_f.write(chr_name)
                    contigs = set(align.contig for align in chr_aligns)
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # reference sequence it is met on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Report contigs aligned over more than 90% of
                            # the length stated in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' +
                                                       str(aligned_len) +
                                                       '\t' + contig_cov +
                                                       '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    final_status = AlignerStatus.OK if ref_aligns else AlignerStatus.NOT_ALIGNED
    return final_status, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Esempio n. 32
0
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens,
                    is_cyclic=None):
    """Classify every contig of an assembly by the way it aligns to the reference.

    For each contig read from ``contigs_fpath`` the alignments in ``aligns`` that are at least
    ``qconfig.min_alignment`` long are examined and the contig is labelled 'unaligned', 'correct',
    'correct_unaligned', 'ambiguous', 'mis_unaligned' or 'misassembled'. Along the way the function
    accumulates counters (unaligned / partially unaligned / ambiguous contigs and bases,
    misassemblies) and writes:

    * a human-readable log to ``ca_output.stdout_f``;
    * per-alignment and per-contig records to ``ca_output.icarus_out_f``;
    * the alignments that were kept to ``ca_output.coords_filtered_f``;
    * the names of fully unaligned contigs to ``unaligned_fpath``;
    * a tab-separated table (header plus rows produced by ``save_unaligned_info``) to
      ``unaligned_info_fpath``.

    :param ca_output: object exposing the ``stdout_f``, ``icarus_out_f`` and ``coords_filtered_f``
        writable handles; it is also handed to ``process_misassembled_contig``.
    :param contigs_fpath: path of the FASTA file with the contigs to analyze.
    :param unaligned_fpath: path of the output file listing fully unaligned contigs.
    :param unaligned_info_fpath: path of the output table describing unaligned parts.
    :param aligns: mapping from contig name to its alignments; each alignment is used through
        ``len2``, ``idy``, ``ref``, ``start()``, ``end()``, ``coords_str()`` and
        ``icarus_report_str()``.
    :param ref_features: not inspected here, only forwarded to ``process_misassembled_contig``.
    :param ref_lens: not inspected here, only forwarded to ``get_best_aligns_sets`` and
        ``process_misassembled_contig``.
    :param is_cyclic: not inspected here, only forwarded to the same two helpers.
    :return: tuple ``(result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs,
        misassemblies_in_contigs, contigs_aligned_lengths)`` where ``result`` is a dict of the
        accumulated counters, ``ref_aligns`` maps a reference name to the alignments kept for it,
        ``misassembled_contigs`` maps contig name to its length, and the last two lists hold one
        entry per contig in input order.
    """
    # A top alignment "captures most of the contig" when it is longer than `epsilon` of the contig
    # or leaves fewer than `maxun` bases uncovered (see the check inside the loop).
    maxun = 10
    epsilon = 0.99

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    half_unaligned_with_misassembly = 0
    misassembly_internal_overlap = 0

    ref_aligns = dict()
    contigs_aligned_lengths = []
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()
    misassemblies_in_contigs = []

    region_struct_variations = find_all_sv(qconfig.bed)

    # Square table (reference label x reference label) of counters, all starting at zero;
    # it is handed to process_misassembled_contig and returned in `result`.
    istranslocations_by_ref = dict()
    misassemblies_by_ref = defaultdict(list)
    for ref in ref_labels_by_chromosomes.values():
        istranslocations_by_ref[ref] = dict((key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from local misassemblies)
    total_indels_info = IndelsInfo()

    # NOTE(review): both files are closed only after the loop, so an exception inside it leaves them open.
    unaligned_file = open(unaligned_fpath, 'w')
    unaligned_info_file = open(unaligned_info_fpath, 'w')
    unaligned_info_file.write('\t'.join(['Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type', 'Unaligned_parts']) + '\n')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len))
        contig_type = 'unaligned'
        # Per-contig slots; they are overwritten below through index [-1] when the contig aligns.
        misassemblies_in_contigs.append(0)
        contigs_aligned_lengths.append(0)
        filtered_aligns = []
        if contig in aligns:
            filtered_aligns = [align for align in aligns[contig] if align.len2 >= qconfig.min_alignment]

        #Check if this contig aligned to the reference
        if filtered_aligns:
            contig_type = 'correct'
            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(filtered_aligns, key=lambda x: (score_single_align(x), x.len2), reverse=True)
            top_len = sorted_aligns[0].len2
            top_id = sorted_aligns[0].idy
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            ca_output.stdout_f.write('Best alignment score: %.1f (LEN: %d, IDY: %.2f), Total number of alignments: %d\n'
                                     % (top_score, top_len, top_id, len(sorted_aligns)))

            #Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments while length and identity are identical
                #while sorted_aligns and top_len == sorted_aligns[0].len2 and top_id == sorted_aligns[0].idy:
                while sorted_aligns and (score_single_align(sorted_aligns[0]) >= qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    ca_output.stdout_f.write('\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n' % str(qconfig.ambiguity_score))
                    for align in sorted_aligns:
                        ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    ca_output.stdout_f.write('\t\tOne align captures most of this contig: %s\n' % str(top_aligns[0]))
                    ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
                    ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                    ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
                    aligned_lengths.append(top_aligns[0].len2)
                    contigs_aligned_lengths[-1] = top_aligns[0].len2
                else:
                    #There is more than one top align
                    ca_output.stdout_f.write('\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n' %
                                             len(top_aligns))

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0].len2
                        ca_output.stdout_f.write('\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n')
                        for align in top_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n')
                        ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
                        ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
                        ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0].len2)
                        contigs_aligned_lengths[-1] = top_aligns[0].len2
                        ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                    elif qconfig.ambiguity_usage == "all":
                        # Subtract the first alignment up front; the loop below adds every alignment
                        # (the first one included), so only the extra ones contribute in total.
                        ambiguous_contigs_extra_bases -= top_aligns[0].len2
                        ca_output.stdout_f.write('\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n')
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        contig_type = 'ambiguous'
                        while len(top_aligns):
                            ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
                            ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str(ambiguity=True) + '\n')
                            ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0].len2)
                                contigs_aligned_lengths[-1] = top_aligns[0].len2
                            ambiguous_contigs_extra_bases += top_aligns[0].len2
                            ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + ' ambiguous\n')
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, is_cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                # When there are too many best sets every alignment is treated as used.
                used_indexes = list(range(len(sorted_aligns)) if too_much_best_sets else get_used_indexes(best_sets))
                if len(used_indexes) < len(sorted_aligns):
                    ca_output.stdout_f.write('\t\t\tSkipping redundant alignments after choosing the best set of alignments\n')
                    for idx in set([idx for idx in range(len(sorted_aligns)) if idx not in used_indexes]):
                        ca_output.stdout_f.write('\t\tSkipping redundant alignment ' + str(sorted_aligns[idx]) + '\n')

                if is_ambiguous:
                    ca_output.stdout_f.write('\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n')
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                        ca_output.stdout_f.write('\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n')
                        for idx in used_indexes:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n')
                        # NOTE(review): this `continue` bypasses the 'CONTIG' record and the blank
                        # separator line written at the bottom of the loop -- confirm it is intended.
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        ca_output.stdout_f.write('\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n')
                        if len(the_best_set.indexes) < len(used_indexes):
                            ca_output.stdout_f.write('\t\tSo, skipping alignments from other sets:\n')
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n')
                    elif qconfig.ambiguity_usage == "all":
                        ca_output.stdout_f.write('\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n')
                        ca_output.stdout_f.write('\t\t\tThe very best set is shown in details below, the rest are:\n')
                        for idx, cur_set in enumerate(best_sets[1:]):
                            # idx + 2: groups are numbered from 1 and the best set (group #1) is reported separately.
                            ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered))
                        if too_much_best_sets:
                            ca_output.stdout_f.write('\t\t\t\tetc...\n')
                        if len(the_best_set.indexes) < len(used_indexes):
                            ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                            ca_output.stdout_f.write('\t\t\tList of alignments used in the sets above:\n')
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                                ref_aligns.setdefault(align.ref, []).append(align)
                                ambiguous_contigs_extra_bases += align.len2
                                ca_output.coords_filtered_f.write(align.coords_str() + " ambiguous\n")
                                if idx not in the_best_set.indexes:
                                    ca_output.icarus_out_f.write(align.icarus_report_str(is_best=False) + '\n')

                ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                             (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered))
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    ca_output.coords_filtered_f.write(the_only_align.coords_str() + '\n')
                    aligned_lengths.append(the_only_align.len2)
                    contigs_aligned_lengths[-1] = the_only_align.len2

                    # begin/end are used as 1-based inclusive contig coordinates: the unaligned
                    # prefix is seq[:begin - 1] and the unaligned suffix is seq[end:].
                    begin, end = the_only_align.start(), the_only_align.end()
                    unaligned_bases = (begin - 1) + (ctg_len - end)
                    number_unaligned_ns = seq[:begin - 1].count('N') + seq[end:].count('N')
                    aligned_bases_in_contig = ctg_len - unaligned_bases
                    acgt_ctg_len = ctg_len - seq.count('N')
                    is_partially_unaligned = check_partially_unaligned(seq, real_aligns, ctg_len)
                    if is_partially_unaligned:
                        partially_unaligned += 1
                        partially_unaligned_bases += unaligned_bases - number_unaligned_ns
                        if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len:
                            contig_type = 'correct_unaligned'
                        ca_output.stdout_f.write('\t\tThis contig is partially unaligned. '
                                                 '(Aligned %d out of %d non-N bases (%.2f%%))\n'
                                                 % (aligned_bases_in_contig, acgt_ctg_len,
                                                    100.0 * aligned_bases_in_contig / acgt_ctg_len))
                        save_unaligned_info(real_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file)
                    ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(the_only_align))
                    ca_output.icarus_out_f.write(the_only_align.icarus_report_str() + '\n')
                    if is_partially_unaligned:
                        if begin - 1:
                            ca_output.stdout_f.write('\t\tUnaligned bases: 1 to %d (%d)\n' % (begin - 1, begin - 1))
                        if ctg_len - end:
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d to %d (%d)\n' % (end + 1, ctg_len, ctg_len - end))
                        if qconfig.is_combined_ref:
                            check_for_potential_translocation(seq, ctg_len, real_aligns, region_misassemblies,
                                                              misassemblies_by_ref, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align.ref, []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    #There is more than one alignment of this contig to the reference
                    ca_output.stdout_f.write('\t\tThis contig is misassembled.\n')
                    unaligned_bases = the_best_set.uncovered
                    # Count the Ns lying in the gaps before, between and after the alignments.
                    number_unaligned_ns, prev_pos = 0, 0
                    for align in sorted_aligns:
                        number_unaligned_ns += seq[prev_pos: align.start() - 1].count('N')
                        prev_pos = align.end()
                    number_unaligned_ns += seq[prev_pos:].count('N')

                    aligned_bases_in_contig = ctg_len - unaligned_bases
                    number_ns = seq.count('N')
                    acgt_ctg_len = ctg_len - number_ns
                    is_partially_unaligned = check_partially_unaligned(seq, sorted_aligns, ctg_len)
                    if is_partially_unaligned:
                        partially_unaligned += 1
                        partially_unaligned_bases += unaligned_bases - number_unaligned_ns
                        ca_output.stdout_f.write('\t\tThis contig is partially unaligned. '
                                                 '(Aligned %d out of %d non-N bases (%.2f%%))\n'
                                                 % (aligned_bases_in_contig, acgt_ctg_len,
                                                 100.0 * aligned_bases_in_contig / acgt_ctg_len))
                        save_unaligned_info(sorted_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file)

                    if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len:
                        ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                                                 'Contig length is %d (number of Ns: %d) and total length of all aligns is %d\n' %
                                                 (ctg_len, number_ns, aligned_bases_in_contig))
                        contigs_aligned_lengths[-1] = sum(align.len2 for align in sorted_aligns)
                        for align in sorted_aligns:
                            ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                            ca_output.icarus_out_f.write(align.icarus_report_str() + '\n')
                            ca_output.icarus_out_f.write('unknown\n')
                            ca_output.coords_filtered_f.write(align.coords_str() + '\n')
                            aligned_lengths.append(align.len2)
                            ref_aligns.setdefault(align.ref, []).append(align)

                        half_unaligned_with_misassembly += 1
                        ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                        contig_type = 'mis_unaligned'
                        # The 'CONTIG' record and separator are written here because the `continue`
                        # below skips the ones at the bottom of the loop.
                        ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type + '\n']))
                        ca_output.stdout_f.write('\n')
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, indels_info, cnt_misassemblies, contig_aligned_length = \
                        process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref,
                                                    istranslocations_by_ref, region_struct_variations, ca_output)
                    contigs_aligned_lengths[-1] = contig_aligned_length
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                        misassemblies_in_contigs[-1] = cnt_misassemblies
                    if is_partially_unaligned:
                        ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                        if qconfig.is_combined_ref:
                            check_for_potential_translocation(seq, ctg_len, sorted_aligns, region_misassemblies,
                                                              misassemblies_by_ref, ca_output.stdout_f)
        else:
            #No aligns to this contig
            ca_output.stdout_f.write('\t\tThis contig is unaligned. (%d bp)\n' % ctg_len)
            unaligned_file.write(contig + '\n')

            #Increment unaligned contig count and bases
            unaligned += 1
            number_ns = seq.count('N')
            fully_unaligned_bases += ctg_len - number_ns
            ca_output.stdout_f.write('\t\tUnaligned bases: %d (number of Ns: %d)\n' % (ctg_len, number_ns))
            save_unaligned_info([], contig, ctg_len, ctg_len, unaligned_info_file)

        # Per-contig trailer: tab-separated record with the final contig type, then a blank log line.
        ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type]) + '\n')
        ca_output.stdout_f.write('\n')

    unaligned_file.close()
    unaligned_info_file.close()
    misassembled_bases = sum(misassembled_contigs.values())

    # special case: --skip-unaligned-mis-contigs is specified
    if qconfig.unaligned_mis_threshold == 0.0:
        half_unaligned_with_misassembly = None

    result = {'region_misassemblies': region_misassemblies,
              'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None,
              'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases,
              'misassembly_internal_overlap': misassembly_internal_overlap,
              'unaligned': unaligned, 'partially_unaligned': partially_unaligned,
              'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases,
              'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
              'ambiguous_contigs_len': ambiguous_contigs_len,
              'half_unaligned_with_misassembly': half_unaligned_with_misassembly,
              'misassemblies_by_ref': misassemblies_by_ref,
              'istranslocations_by_refs': istranslocations_by_ref}

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
Esempio n. 33
0
    return qutils.correct_name(name)
    # return re.sub(r'\W', '', re.sub(r'\s', '_', name))

# MAIN
# Script body: print the FASTA headers of the requested contigs.
#   argv[1] -- input FASTA file
#   argv[2] -- a single contig id, or the path of a file holding one contig id per line
if len(sys.argv) != 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <contig id or file with list of contig ids>")
    sys.exit()

if os.path.isfile(sys.argv[2]):
    # Read the id list inside a context manager so the handle is closed right away
    # instead of being left to the garbage collector.
    with open(sys.argv[2]) as ids_f:
        list_of_ids = [line.strip() for line in ids_f]
else:
    list_of_ids = [sys.argv[2]]

# Index every sequence of the input FASTA by its corrected name so that ids can be
# matched regardless of the characters that get_corr_name() normalizes.
origin_fasta = fastaparser.read_fasta(sys.argv[1])
dict_of_all_contigs = dict()
selected_contigs = []
for (name, seq) in origin_fasta:
    corr_name = get_corr_name(name)
    dict_of_all_contigs[corr_name] = seq

for name in list_of_ids:
    corr_name = get_corr_name(name)
    if corr_name in dict_of_all_contigs:
        selected_contigs.append((name, dict_of_all_contigs[corr_name]))
    else:
        # Same text as the former Python 2 `print >> sys.stderr, ...` statement (items
        # separated by single spaces), written in a form that also runs under Python 3.
        sys.stderr.write("Contig %s (cor name: %s ) not found!\n" % (name, corr_name))

for (name, seq) in selected_contigs:
    # Parenthesized form prints the same line under Python 2 and Python 3.
    print('>' + name)
Esempio n. 34
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the KMC-based unique k-mer analysis of assemblies against the reference.

    Steps, in order:

    1. For every assembly for which ``check_kmc_successful_check`` succeeds, the values stored in
       ``<output_dir>/<label>.stat`` are loaded into the report and the assembly is not
       processed again.
    2. KMC binaries are obtained; k-mers of the reference are counted.
    3. For each remaining assembly the k-mer completeness (share of reference k-mers found in
       the assembly) is added to the report.
    4. Unless the reference or the assembly is too fragmented, contigs are split into those
       carrying markers of exactly one reference sequence, of several, or of none, and the
       corresponding length percentages are added to the report.
    5. A stats file is written for every processed assembly through ``create_kmc_stats_file``
       and the temporary directory is removed unless ``qconfig.debug`` is set.

    :param output_dir: directory for logs, ``.stat`` files and the ``tmp`` working directory.
    :param ref_fpath: path of the reference FASTA file.
    :param contigs_fpaths: paths of the assembly FASTA files.
    :param logger: logger used for progress messages and warnings.
    :return: always ``None``; results are stored in the ``reporting`` objects of the assemblies.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            # Read through a context manager so the handle is closed immediately.
            with open(kmc_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # str.split() never returns an empty list, so an empty stats file shows up as an
            # empty first line; skip it (the assembly is then recomputed below) instead of
            # failing in float('').
            if not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # Lines 2-5 (one-chromosome / multi-chromosome / no-chromosome / total lengths) are
            # only used when all of them are present.
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    # Only assemblies without reusable results go through the actual KMC run.
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Create the log files, or truncate the ones left by a previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # Markers are taken from the k-mers shared by the reference and all assemblies.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 0.001

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []

    # Per-sequence reference databases are built only when the reference is not too fragmented;
    # otherwise the accuracy step is skipped for every assembly below.
    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                                     intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        # Dump every long enough contig into its own FASTA file and list those files.
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            # Record, for every contig, the reference sequences whose markers it contains.
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
Esempio n. 35
0
import sys
import os
sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import quast_libs
from quast_libs import fastaparser

# Script body: print the START..END fragment (1-based, inclusive) of every entry of a FASTA
# file, optionally reverse-complemented.
if len(sys.argv) <= 3 or len(sys.argv) >= 6:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
# Any fifth argument, whatever its value, requests the reverse complement.
reverse = len(sys.argv) == 5

for name, seq in fastaparser.read_fasta(inp):
    seq_len = len(seq)
    # Both bounds are capped by the sequence length; END == -1 means "up to the end".
    cur_start = min(start, seq_len)
    cur_end = seq_len if end == -1 else min(end, seq_len)
    print(">" + name + "_cropped_" + str(cur_start) + "_" + str(cur_end))
    # Convert the 1-based START to a slice offset and clamp it at 0: a negative offset would
    # silently wrap around and take the fragment from the end of the sequence when START < 1.
    fragment = seq[max(cur_start - 1, 0):cur_end]
    if reverse:
        print(fastaparser.rev_comp(fragment))
    else:
        print(fragment)

Esempio n. 36
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic (alignment-free) statistics for every assembly.

    For each contigs file the function collects contig lengths, the number of
    'N' characters and GC information, stores the N50/L50-style metrics in the
    report obtained from ``reporting.get``, and, depending on ``qconfig``
    options, saves data for the HTML report and draws plots.

    Args:
        ref_fpath: path to the reference FASTA file, or a false value when no
            reference is available.
        contigs_fpaths: paths to the FASTA files of the assemblies.
        output_dirpath: directory for this module's output (created if it
            does not exist).
        results_dir: directory passed to the HTML saver and to the Nx plotter.

    Returns:
        A tuple ``(icarus_gc_fpath, circos_gc_fpath)``. Each element stays
        None unless the corresponding reference GC file was written.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info('  Reference genome:')
        # The conditional expression must be parenthesized: it binds weaker
        # than '+', so without the parentheses the whole message would be
        # replaced by 'undefined' when the GC content is unknown.
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning('  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    # coverage_dict[fpath][cov] = total length of contigs whose name carries
    # the coverage value `cov` (truncated to an integer) in a '_cov_<number>' tag.
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            # Remember (once per assembly) that it looks like scaffolds
            # although the scaffolds mode was not requested.
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                # Grow the histogram so that index `cov` exists.
                if len(coverage) <= cov:
                    coverage += [0] * (cov - len(coverage) + 1)
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs for the HTML plots: merge every `multiplicator`
        # consecutive lengths into one point; the remainder forms the last point.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths) else
                                 sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for corr_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # Both conditional expressions are parenthesized on purpose: otherwise
        # the trailing one would swallow the whole message and only
        # 'undefined' would be logged for an empty assembly.
        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # NOTE: this is an alias, not a copy; the reference distribution is
    # appended to the very same list object.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
                                                join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        # Coverage histograms are drawn only if at least one assembly had
        # coverage tags in its contig names.
        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
Esempio n. 37
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs``, parses the resulting coords file (and the SNP file
    when ``qconfig.show_snps`` is set), then calls ``analyze_contigs`` and
    ``analyze_coverage`` and writes the auxiliary output files.

    Args:
        is_cyclic: forwarded to ``analyze_contigs``.
        index: index of the assembly, used as a prefix of log messages.
        contigs_fpath: path to the assembly FASTA file.
        output_dirpath: directory for reports and auxiliary files.
        ref_fpath: path to the reference FASTA file.
        old_contigs_fpath: forwarded to ``align_contigs``.
        bed_fpath: not used in this function body.
        parallel_by_chr: forwarded to ``align_contigs``.
        threads: forwarded to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``. If the alignment step did not succeed,
        ``(nucmer_status, {}, [], [], [])`` is returned instead.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        # In the space-efficient mode nothing is kept on disk.
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # On the normal path these handles are handed over to CAOutput
        # (presumably closed by close_handlers -- confirm); on this early exit
        # nothing else owns them, so close them here instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    with open(coords_fpath) as coords_file:
        # The first two lines of the coords file are copied verbatim.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            # Group the parsed alignments by contig name
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    # snps[reference][contig][reference position] -> list of SNP records
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            line = line.split()
            # Skip blank lines and header lines (data lines start with a position).
            if not line or not line[0].isdigit():
                continue
            # Skip a record identical to the previously stored one.
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])
            loc = int(line[3])

            ctg_snps = snps.setdefault(ref, {}).setdefault(ctg, {})
            ctg_snps.setdefault(pos, []).append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any): every reference sequence is taken as a
    # single region covering its whole length.
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Contig names are expected to carry '_length_<len>_cov_<cov>' tags;
        # contigs without them are not written to the unique contigs file.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name followed by the
                    # names of all contigs aligned to it.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Every contig is considered only once, for the first
                        # chromosome it is met on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len = len_cov_match.group(1)
                            contig_cov = len_cov_match.group(2)
                            # Report contigs aligned by more than 90% of their length.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Esempio n. 38
0
def do(contigs_fpaths,
       contig_report_fpath_pattern,
       output_dirpath,
       ref_fpath,
       cov_fpath=None,
       physical_cov_fpath=None,
       gc_fpath=None,
       stdout_pattern=None,
       find_similar=True,
       features=None,
       json_output_dir=None,
       genes_by_labels=None):
    """Prepare the data for the Icarus viewer and the alignment plot.

    Args:
        contigs_fpaths: paths to the FASTA files of the assemblies.
        contig_report_fpath_pattern: '%'-pattern of the per-assembly contig
            report path; a false value means contigs are parsed directly from
            the FASTA files and no aligned blocks are collected.
        output_dirpath: output directory, passed to ``make_output_dir``.
        ref_fpath: path to the reference FASTA file, or a false value.
        cov_fpath, physical_cov_fpath, gc_fpath, stdout_pattern,
        json_output_dir: forwarded to ``js_data_gen``.
        find_similar: forwarded to ``get_assemblies``.
        features: forwarded to ``parse_features_data``.
        genes_by_labels: forwarded to ``parse_genes_data``.

    Returns:
        A tuple ``(icarus_html_fpath, plot_fpath)``; either element may be
        None. ``(None, None)`` is also returned when a contig report yields
        no aligned blocks (``aligned_blocks is None``).
    """
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}

    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None
    # Defined up front so that the report parser below can be called even
    # when no reference file is given.
    cumulative_ref_lengths = [0]

    plot_fpath = None

    if ref_fpath:
        for name, seq in fastaparser.read_fasta(ref_fpath):
            chr_name = name.split()[0]
            chr_names.append(chr_name)
            reference_chromosomes[chr_name] = len(seq)
        virtual_genome_shift = 100
        sorted_ref_names = sorted(reference_chromosomes,
                                  key=reference_chromosomes.get,
                                  reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(),
                                    reverse=True)
        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()
                 ) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            contig_names_by_refs = dict()
            if len(chr_names) > qconfig.ICARUS_MAX_CHROMOSOMES:
                # Too many chromosomes for one page each: pack consecutive
                # chromosomes into numbered parts, starting a new part once
                # the accumulated length reaches MAX_SIZE_FOR_COMB_PLOT.
                summary_len = 0
                num_parts = 1
                html_name = qconfig.alignment_viewer_part_name + str(num_parts)
                for chr_name, chr_len in reference_chromosomes.items():
                    summary_len += chr_len
                    contig_names_by_refs[chr_name] = html_name
                    if summary_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        summary_len = 0
                        num_parts += 1
                        html_name = qconfig.alignment_viewer_part_name + str(
                            num_parts)
            else:
                for chr_name in chr_names:
                    contig_names_by_refs[chr_name] = chr_name

        for i, chr_name in enumerate(chr_names):
            chr_length = reference_chromosomes[chr_name]
            len_to_append = cumulative_ref_lengths[-1] + chr_length
            if contig_names_by_refs:
                # The cumulative offset restarts from zero when the next
                # chromosome belongs to a different group.
                if i < len(chr_names) - 1 and contig_names_by_refs[
                        chr_name] != contig_names_by_refs[chr_names[i + 1]]:
                    len_to_append = 0
            cumulative_ref_lengths.append(len_to_append)
        virtual_genome_size = sum(reference_chromosomes.values(
        )) + virtual_genome_shift * (len(reference_chromosomes) - 1)

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if not contig_report_fpath_pattern:
            contigs = parse_contigs_fpath(contigs_fpath)
        else:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(
                contigs_fpath)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parse_aligner_contig_report(
                report_fpath, list(reference_chromosomes.keys()),
                cumulative_ref_lengths)
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                # Keep the shape of the regular return value so that callers
                # can always unpack two items.
                return None, None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(
                aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        contigs_by_assemblies[label] = contigs

    if ref_fpath:
        features_data = parse_features_data(features, cumulative_ref_lengths,
                                            chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)
    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, lists_of_aligned_blocks,
                                    virtual_genome_size, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size,
                                             output_dirpath, sorted_ref_names,
                                             sorted_ref_lengths,
                                             virtual_genome_shift)
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(
            assemblies,
            contigs_fpaths,
            reference_chromosomes,
            output_dirpath,
            structures_by_labels,
            contig_names_by_refs=contig_names_by_refs,
            ref_fpath=ref_fpath,
            stdout_pattern=stdout_pattern,
            ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
            contigs_by_assemblies=contigs_by_assemblies,
            features_data=features_data,
            gc_fpath=gc_fpath,
            cov_fpath=cov_fpath,
            physical_cov_fpath=physical_cov_fpath,
            json_output_dir=json_output_dir)
    else:
        icarus_html_fpath = None

    return icarus_html_fpath, plot_fpath
Esempio n. 39
0
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, cov_fpath=None,  physical_cov_fpath=None,
       stdout_pattern=None, find_similar=True, features=None, json_output_dir=None, genes_by_labels=None):
    """Prepare the data for the Icarus viewer and the alignment plot.

    Args:
        contigs_fpaths: paths to the FASTA files of the assemblies.
        contig_report_fpath_pattern: '%'-pattern of the per-assembly contig
            report path; a false value means contigs are parsed directly from
            the FASTA files and no aligned blocks are collected.
        output_dirpath: output directory, passed to ``make_output_dir``.
        ref_fpath: path to the reference FASTA file, or a false value.
        cov_fpath, physical_cov_fpath, stdout_pattern, json_output_dir:
            forwarded to ``js_data_gen``.
        find_similar: forwarded to ``get_assemblies``.
        features: forwarded to ``parse_features_data``.
        genes_by_labels: forwarded to ``parse_genes_data``.

    Returns:
        A tuple ``(icarus_html_fpath, plot_fpath)``; either element may be
        None. ``(None, None)`` is also returned when a contig report yields
        no aligned blocks (``aligned_blocks is None``).
    """
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}

    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None
    # Defined up front so that the report parser below can be called even
    # when no reference file is given.
    cumulative_ref_lengths = [0]

    plot_fpath = None
    # Above this number of chromosomes they are packed into combined parts.
    max_small_chromosomes = 10

    if ref_fpath:
        for name, seq in fastaparser.read_fasta(ref_fpath):
            chr_name = name.split()[0]
            chr_names.append(chr_name)
            reference_chromosomes[chr_name] = len(seq)
        virtual_genome_shift = 100
        sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)
        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            contig_names_by_refs = dict()
            if len(chr_names) > max_small_chromosomes:
                # Pack consecutive chromosomes into numbered parts, starting a
                # new part once the accumulated length reaches
                # MAX_SIZE_FOR_COMB_PLOT.
                summary_len = 0
                num_parts = 1
                html_name = qconfig.alignment_viewer_part_name + str(num_parts)
                for chr_name, chr_len in reference_chromosomes.items():
                    summary_len += chr_len
                    contig_names_by_refs[chr_name] = html_name
                    if summary_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        summary_len = 0
                        num_parts += 1
                        html_name = qconfig.alignment_viewer_part_name + str(num_parts)
            else:
                for chr_name in chr_names:
                    contig_names_by_refs[chr_name] = chr_name

        for i, chr_name in enumerate(chr_names):
            chr_length = reference_chromosomes[chr_name]
            len_to_append = cumulative_ref_lengths[-1] + chr_length
            if contig_names_by_refs:
                # The cumulative offset restarts from zero when the next
                # chromosome belongs to a different group.
                if i < len(chr_names) - 1 and contig_names_by_refs[chr_name] != contig_names_by_refs[chr_names[i + 1]]:
                    len_to_append = 0
            cumulative_ref_lengths.append(len_to_append)
        virtual_genome_size = sum(reference_chromosomes.values()) + virtual_genome_shift * (len(reference_chromosomes) - 1)

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if not contig_report_fpath_pattern:
            contigs = parse_contigs_fpath(contigs_fpath)
        else:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parse_nucmer_contig_report(report_fpath,
                                                                        list(reference_chromosomes.keys()), cumulative_ref_lengths)
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                # Keep the shape of the regular return value so that callers
                # can always unpack two items.
                return None, None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        contigs_by_assemblies[label] = contigs

    if contigs_fpaths and ref_fpath and features:
        features_data = parse_features_data(features, cumulative_ref_lengths, chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)
    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, virtual_genome_size, lists_of_aligned_blocks, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size, output_dirpath, sorted_ref_names, sorted_ref_lengths, virtual_genome_shift)
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(assemblies, contigs_fpaths, reference_chromosomes,
                    output_dirpath, structures_by_labels, contig_names_by_refs=contig_names_by_refs, ref_fpath=ref_fpath, stdout_pattern=stdout_pattern,
                    ambiguity_alignments_by_labels=ambiguity_alignments_by_labels, contigs_by_assemblies=contigs_by_assemblies,
                    features_data=features_data, cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, json_output_dir=json_output_dir)
    else:
        icarus_html_fpath = None

    return icarus_html_fpath, plot_fpath
Esempio n. 40
0
def analyze_contigs(ca_output,
                    contigs_fpath,
                    unaligned_fpath,
                    unaligned_info_fpath,
                    aligns,
                    ref_features,
                    ref_lens,
                    is_cyclic=None):
    """Classify every contig of an assembly by its alignments to the reference.

    Each FASTA record read from *contigs_fpath* is looked up in *aligns*.
    Depending on how much of the contig the best alignment covers, the contig
    is either handled as a "single top hit" (possibly ambiguous) or passed to
    ``get_best_aligns_sets`` to select a set of alignments, which is then
    reported as a single alignment or analysed for misassemblies with
    ``process_misassembled_contig``.  A textual log goes to
    ``ca_output.stdout_f``, Icarus records to ``ca_output.icarus_out_f`` and the
    alignments that are kept to ``ca_output.coords_filtered_f``.

    Args:
        ca_output: object exposing the writable attributes ``stdout_f``,
            ``icarus_out_f`` and ``coords_filtered_f``.
        contigs_fpath: path of the FASTA file with the contigs.
        unaligned_fpath: output path; receives one name per line for every
            contig that has no entry in *aligns*.
        unaligned_info_fpath: output path of a tab-separated table (header is
            written here, rows are written by ``save_unaligned_info``).
        aligns: mapping ``contig name -> list of alignment objects``; the
            objects are used through ``start()``, ``end()``, ``len2``, ``idy``,
            ``ref`` and ``icarus_report_str()``.
        ref_features: forwarded unchanged to ``process_misassembled_contig``.
        ref_lens: forwarded to ``get_best_aligns_sets`` and
            ``process_misassembled_contig``.
        is_cyclic: forwarded to the same two helpers.

    Returns:
        A 7-tuple ``(result, ref_aligns, total_indels_info, aligned_lengths,
        misassembled_contigs, misassemblies_in_contigs,
        contigs_aligned_lengths)`` where ``result`` is a dict of the counters
        accumulated below, ``ref_aligns`` maps a reference name to the
        alignments kept for it, and the two ``*_in_contigs`` / ``contigs_*``
        lists hold one entry per contig in file order.
    """
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    half_unaligned_with_misassembly = 0
    misassembly_internal_overlap = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    contigs_aligned_lengths = []
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()
    misassemblies_in_contigs = []

    region_struct_variations = find_all_sv(qconfig.bed)

    istranslocations_by_ref = dict()
    misassemblies_by_ref = defaultdict(list)
    for ref in ref_labels_by_chromosomes.values():
        istranslocations_by_ref[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    # Both report files are managed by the ``with`` statement so that they are
    # flushed and closed even if one of the helpers below raises.
    with open(unaligned_fpath, 'w') as unaligned_file, \
            open(unaligned_info_fpath, 'w') as unaligned_info_file:
        unaligned_info_file.write('\t'.join([
            'Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type',
            'Unaligned_parts'
        ]) + '\n')
        for contig, seq in fastaparser.read_fasta(contigs_fpath):
            #Recording contig stats
            ctg_len = len(seq)
            ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len))
            contig_type = 'unaligned'
            misassemblies_in_contigs.append(0)
            contigs_aligned_lengths.append(0)

            #Check if this contig aligned to the reference
            if contig in aligns:
                contig_type = 'correct'
                #Pull all aligns for this contig
                num_aligns = len(aligns[contig])

                #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
                sorted_aligns = sorted(aligns[contig],
                                       key=lambda x:
                                       (score_single_align(x), x.len2),
                                       reverse=True)
                top_len = sorted_aligns[0].len2
                top_id = sorted_aligns[0].idy
                top_score = score_single_align(sorted_aligns[0])
                top_aligns = []
                ca_output.stdout_f.write(
                    'Best alignment score: %.1f (LEN: %d, IDY: %.2f)\n' %
                    (top_score, top_len, top_id))

                #Check that top hit captures most of the contig
                if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                    #Reset top aligns: aligns that share the same value of longest and highest identity
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                    #Continue grabbing alignments while their score is close enough to the top score
                    while sorted_aligns and (score_single_align(
                            sorted_aligns[0]) >=
                                             qconfig.ambiguity_score * top_score):
                        top_aligns.append(sorted_aligns[0])
                        sorted_aligns = sorted_aligns[1:]

                    #Mark other alignments as insignificant (former ambiguous)
                    if sorted_aligns:
                        ca_output.stdout_f.write(
                            '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n'
                            % str(qconfig.ambiguity_score))
                        for align in sorted_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' +
                                                     str(align) + '\n')

                    if len(top_aligns) == 1:
                        #There is only one top align, life is good
                        ca_output.stdout_f.write(
                            '\t\tOne align captures most of this contig: %s\n' %
                            str(top_aligns[0]))
                        ca_output.icarus_out_f.write(
                            top_aligns[0].icarus_report_str() + '\n')
                        ref_aligns.setdefault(top_aligns[0].ref,
                                              []).append(top_aligns[0])
                        ca_output.coords_filtered_f.write(
                            str(top_aligns[0]) + '\n')
                        aligned_lengths.append(top_aligns[0].len2)
                        contigs_aligned_lengths[-1] = top_aligns[0].len2
                    else:
                        #There is more than one top align
                        ca_output.stdout_f.write(
                            '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n'
                            % len(top_aligns))

                        #Increment count of ambiguously mapped contigs and bases in them
                        ambiguous_contigs += 1
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                        ambiguous_contigs_len += ctg_len

                        # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                        if qconfig.ambiguity_usage == "none":
                            ambiguous_contigs_extra_bases -= top_aligns[0].len2
                            ca_output.stdout_f.write(
                                '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n'
                            )
                            for align in top_aligns:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' + str(align) +
                                    '\n')
                        elif qconfig.ambiguity_usage == "one":
                            ambiguous_contigs_extra_bases += 0
                            ca_output.stdout_f.write(
                                '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n'
                            )
                            ca_output.stdout_f.write('\t\t\tAlignment: %s\n' %
                                                     str(top_aligns[0]))
                            ca_output.icarus_out_f.write(
                                top_aligns[0].icarus_report_str() + '\n')
                            ref_aligns.setdefault(top_aligns[0].ref,
                                                  []).append(top_aligns[0])
                            aligned_lengths.append(top_aligns[0].len2)
                            contigs_aligned_lengths[-1] = top_aligns[0].len2
                            ca_output.coords_filtered_f.write(
                                str(top_aligns[0]) + '\n')
                            top_aligns = top_aligns[1:]
                            for align in top_aligns:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' + str(align) +
                                    '\n')
                        elif qconfig.ambiguity_usage == "all":
                            ambiguous_contigs_extra_bases -= top_aligns[0].len2
                            ca_output.stdout_f.write(
                                '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n'
                            )
                            # we count only extra bases, so we shouldn't include bases in the first alignment
                            first_alignment = True
                            contig_type = 'ambiguous'
                            while len(top_aligns):
                                ca_output.stdout_f.write('\t\t\tAlignment: %s\n' %
                                                         str(top_aligns[0]))
                                ca_output.icarus_out_f.write(
                                    top_aligns[0].icarus_report_str(
                                        ambiguity=True) + '\n')
                                ref_aligns.setdefault(top_aligns[0].ref,
                                                      []).append(top_aligns[0])
                                if first_alignment:
                                    first_alignment = False
                                    aligned_lengths.append(top_aligns[0].len2)
                                    contigs_aligned_lengths[-1] = top_aligns[
                                        0].len2
                                ambiguous_contigs_extra_bases += top_aligns[0].len2
                                ca_output.coords_filtered_f.write(
                                    str(top_aligns[0]) + ' ambiguous\n')
                                top_aligns = top_aligns[1:]
                else:
                    # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                    is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                        sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                        is_cyclic, region_struct_variations)
                    the_best_set = best_sets[0]
                    used_indexes = list(
                        range(len(sorted_aligns))
                        if too_much_best_sets else get_used_indexes(best_sets))
                    if len(used_indexes) < len(sorted_aligns):
                        ca_output.stdout_f.write(
                            '\t\t\tSkipping redundant alignments after choosing the best set of alignments\n'
                        )
                        for idx in set([
                                idx for idx in range(len(sorted_aligns))
                                if idx not in used_indexes
                        ]):
                            ca_output.stdout_f.write(
                                '\t\tSkipping redundant alignment ' +
                                str(sorted_aligns[idx]) + '\n')

                    if is_ambiguous:
                        ca_output.stdout_f.write(
                            '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n'
                        )
                        # similar to regular ambiguous contigs, see above
                        ambiguous_contigs += 1
                        ambiguous_contigs_len += ctg_len

                        if qconfig.ambiguity_usage == "none":
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            ca_output.stdout_f.write(
                                '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n'
                            )
                            for idx in used_indexes:
                                ca_output.stdout_f.write(
                                    '\t\t\tSkipping alignment ' +
                                    str(sorted_aligns[idx]) + '\n')
                            # nothing else is reported for this contig (no Icarus 'CONTIG' record either)
                            continue
                        elif qconfig.ambiguity_usage == "one":
                            ambiguous_contigs_extra_bases += 0
                            ca_output.stdout_f.write(
                                '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n'
                            )
                            if len(the_best_set.indexes) < len(used_indexes):
                                ca_output.stdout_f.write(
                                    '\t\tSo, skipping alignments from other sets:\n'
                                )
                                for idx in used_indexes:
                                    if idx not in the_best_set.indexes:
                                        ca_output.stdout_f.write(
                                            '\t\t\tSkipping alignment ' +
                                            str(sorted_aligns[idx]) + '\n')
                        elif qconfig.ambiguity_usage == "all":
                            ca_output.stdout_f.write(
                                '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n'
                            )
                            ca_output.stdout_f.write(
                                '\t\t\tThe very best set is shown in details below, the rest are:\n'
                            )
                            for idx, cur_set in enumerate(best_sets[1:]):
                                ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                    (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered))
                            if too_much_best_sets:
                                ca_output.stdout_f.write('\t\t\t\tetc...\n')
                            if len(the_best_set.indexes) < len(used_indexes):
                                ambiguous_contigs_extra_bases -= (
                                    ctg_len - the_best_set.uncovered)
                                ca_output.stdout_f.write(
                                    '\t\t\tList of alignments used in the sets above:\n'
                                )
                                for idx in used_indexes:
                                    align = sorted_aligns[idx]
                                    ca_output.stdout_f.write(
                                        '\t\tAlignment: %s\n' % str(align))
                                    ref_aligns.setdefault(align.ref,
                                                          []).append(align)
                                    ambiguous_contigs_extra_bases += align.len2
                                    ca_output.coords_filtered_f.write(
                                        str(align) + " ambiguous\n")
                                    if idx not in the_best_set.indexes:
                                        ca_output.icarus_out_f.write(
                                            align.icarus_report_str(
                                                is_best=False) + '\n')

                    ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                                 (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered))
                    real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                    # main processing part
                    if len(real_aligns) == 1:
                        the_only_align = real_aligns[0]

                        #There is only one alignment of this contig to the reference
                        ca_output.coords_filtered_f.write(
                            str(the_only_align) + '\n')
                        aligned_lengths.append(the_only_align.len2)
                        contigs_aligned_lengths[-1] = the_only_align.len2

                        begin, end = the_only_align.start(), the_only_align.end()
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        aligned_bases_in_contig = ctg_len - unaligned_bases
                        is_partially_unaligned = check_partially_unaligned(
                            real_aligns, ctg_len)
                        if is_partially_unaligned:
                            partially_unaligned += 1
                            partially_unaligned_bases += unaligned_bases
                            if aligned_bases_in_contig < umt * ctg_len:
                                contig_type = 'correct_unaligned'
                            ca_output.stdout_f.write(
                                '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n'
                                % (aligned_bases_in_contig, ctg_len))
                            save_unaligned_info(real_aligns, contig, ctg_len,
                                                unaligned_bases,
                                                unaligned_info_file)
                        ca_output.stdout_f.write('\t\tAlignment: %s\n' %
                                                 str(the_only_align))
                        ca_output.icarus_out_f.write(
                            the_only_align.icarus_report_str() + '\n')
                        if is_partially_unaligned:
                            if begin - 1:
                                ca_output.stdout_f.write(
                                    '\t\tUnaligned bases: 1 to %d (%d)\n' %
                                    (begin - 1, begin - 1))
                            if ctg_len - end:
                                ca_output.stdout_f.write(
                                    '\t\tUnaligned bases: %d to %d (%d)\n' %
                                    (end + 1, ctg_len, ctg_len - end))
                            if qconfig.is_combined_ref and aligned_bases_in_contig >= umt * ctg_len:
                                check_for_potential_translocation(
                                    seq, ctg_len, real_aligns,
                                    region_misassemblies, misassemblies_by_ref,
                                    ca_output.stdout_f)
                        ref_aligns.setdefault(the_only_align.ref,
                                              []).append(the_only_align)
                    else:
                        #Sort real alignments by position on the contig
                        sorted_aligns = sorted(real_aligns,
                                               key=lambda x: (x.end(), x.start()))

                        #There is more than one alignment of this contig to the reference
                        ca_output.stdout_f.write(
                            '\t\tThis contig is misassembled. %d total aligns.\n' %
                            num_aligns)
                        unaligned_bases = the_best_set.uncovered
                        aligned_bases_in_contig = ctg_len - unaligned_bases
                        is_partially_unaligned = check_partially_unaligned(
                            sorted_aligns, ctg_len)
                        if is_partially_unaligned:
                            partially_unaligned += 1
                            partially_unaligned_bases += unaligned_bases
                            if aligned_bases_in_contig >= umt * ctg_len:
                                ca_output.stdout_f.write(
                                    '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n'
                                    % (aligned_bases_in_contig, ctg_len))
                            save_unaligned_info(sorted_aligns, contig, ctg_len,
                                                unaligned_bases,
                                                unaligned_info_file)

                        if aligned_bases_in_contig < umt * ctg_len:
                            ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                                'Contig length is %d and total length of all aligns is %d\n' % (ctg_len, aligned_bases_in_contig))
                            contigs_aligned_lengths[-1] = sum(
                                align.len2 for align in sorted_aligns)
                            for align in sorted_aligns:
                                ca_output.stdout_f.write('\t\tAlignment: %s\n' %
                                                         str(align))
                                ca_output.icarus_out_f.write(
                                    align.icarus_report_str() + '\n')
                                ca_output.icarus_out_f.write('unknown\n')
                                ca_output.coords_filtered_f.write(
                                    str(align) + '\n')
                                aligned_lengths.append(align.len2)
                                ref_aligns.setdefault(align.ref, []).append(align)

                            half_unaligned_with_misassembly += 1
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' %
                                                     unaligned_bases)
                            contig_type = 'mis_unaligned'
                            # the per-contig trailer is written here because the
                            # ``continue`` below skips the common one at the loop end
                            ca_output.icarus_out_f.write('\t'.join([
                                'CONTIG', contig,
                                str(ctg_len), contig_type + '\n'
                            ]))
                            ca_output.stdout_f.write('\n')
                            continue

                        ### processing misassemblies
                        is_misassembled, current_mio, indels_info, misassemblies_matched_sv, cnt_misassemblies, contig_aligned_length = \
                            process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies,
                                                        ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref,
                                                        istranslocations_by_ref, region_struct_variations, misassemblies_matched_sv,
                                                        ca_output)
                        contigs_aligned_lengths[-1] = contig_aligned_length
                        misassembly_internal_overlap += current_mio
                        total_indels_info += indels_info
                        if is_misassembled:
                            misassembled_contigs[contig] = ctg_len
                            contig_type = 'misassembled'
                            misassemblies_in_contigs[-1] = cnt_misassemblies
                        if is_partially_unaligned:
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' %
                                                     unaligned_bases)
                            if qconfig.is_combined_ref:
                                check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns,
                                    region_misassemblies, misassemblies_by_ref,
                                    ca_output.stdout_f)
            else:
                #No aligns to this contig
                ca_output.stdout_f.write(
                    '\t\tThis contig is unaligned. (%d bp)\n' % ctg_len)
                # one name per line; without the separator consecutive names
                # would run together in the output file
                unaligned_file.write(contig + '\n')

                #Increment unaligned contig count and bases
                unaligned += 1
                fully_unaligned_bases += ctg_len
                ca_output.stdout_f.write('\t\tUnaligned bases: %d total: %d\n' %
                                         (ctg_len, fully_unaligned_bases))
                save_unaligned_info([], contig, ctg_len, ctg_len,
                                    unaligned_info_file)

            ca_output.icarus_out_f.write('\t'.join(
                ['CONTIG', contig, str(ctg_len), contig_type]) + '\n')
            ca_output.stdout_f.write('\n')

    misassembled_bases = sum(misassembled_contigs.values())

    result = {
        'region_misassemblies':
        region_misassemblies,
        'region_struct_variations':
        region_struct_variations.get_count()
        if region_struct_variations else None,
        'misassemblies_matched_sv':
        misassemblies_matched_sv,
        'misassembled_contigs':
        misassembled_contigs,
        'misassembled_bases':
        misassembled_bases,
        'misassembly_internal_overlap':
        misassembly_internal_overlap,
        'unaligned':
        unaligned,
        'partially_unaligned':
        partially_unaligned,
        'partially_unaligned_bases':
        partially_unaligned_bases,
        'fully_unaligned_bases':
        fully_unaligned_bases,
        'ambiguous_contigs':
        ambiguous_contigs,
        'ambiguous_contigs_extra_bases':
        ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len':
        ambiguous_contigs_len,
        'half_unaligned_with_misassembly':
        half_unaligned_with_misassembly,
        'misassemblies_by_ref':
        misassemblies_by_ref,
        'istranslocations_by_refs':
        istranslocations_by_ref
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
Esempio n. 41
0
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute genome coverage, gaps and found genomic features for one assembly.

    The alignment coordinates of the assembly are read from
    ``<coords_dirpath>/<label>.coords`` (or its ``.filtered`` variant unless
    ``qconfig.use_all_alignments`` is set).  Every reference position covered
    by an alignment is marked, positions listed in *ns_by_chromosomes* are
    unmarked, and the per-chromosome sums become the covered lengths.

    Args:
        contigs_fpath: path of the assembly FASTA file.
        index: assembly index, only used for log prefixes.
        coords_dirpath: directory holding the coords files.
        genome_stats_dirpath: directory receiving the gaps / features reports.
        reference_chromosomes: mapping ``chromosome name -> length``.
        ns_by_chromosomes: mapping ``chromosome name -> positions`` that are
            excluded from coverage and never counted as gaps.
        containers: iterable of feature containers (``region_list``, ``kind``);
            when empty, no feature search is done.

    Returns:
        ``(ref_lengths, (results, unsorted_features_in_contigs,
        features_in_contigs, unsorted_operons_in_contigs,
        operons_in_contigs))`` on success, or ``(None, None)`` when the coords
        file is missing or names a chromosome absent from the reference.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None, None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # One flag per reference position; index 0 is unused so that positions can
    # be addressed with the 1-based coordinates found in the coords file.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # split each line once instead of re-splitting it for every field
            bar_columns = line.split('|')
            ref_span = bar_columns[0].split()
            contig_span = bar_columns[1].split()
            s1 = int(ref_span[0])
            e1 = int(ref_span[1])
            s2 = int(contig_span[0])
            e2 = int(contig_span[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # same shape as the other error return, so callers that unpack
                # two values do not crash with a TypeError
                return None, None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1,
                                                                               contig=contig_name, start_in_contig=s2, end_in_contig=e2))
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        # in space-efficient mode the report is discarded; os.devnull keeps that portable
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') if not qconfig.space_efficient else os.devnull
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # a gap may extend to the very end of the chromosome
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_genomic_features_' + container.kind.lower() + '.txt')
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
            found_file.write('=' * 50 + '\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                gene_blocks = []
                if region.id is None:
                    region.id = '# ' + str(region.number + 1)
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if cur_block.seqname != region.seqname:
                            continue
                        if region.end <= cur_block.start or cur_block.end <= region.start:
                            continue
                        elif cur_block.start <= region.start and region.end <= cur_block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            contig_info = cur_block.format_gene_info(region)
                            found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' % (region.id, region.start, region.end, contig_info))
                            if container.kind == 'operon':
                                operons_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                            else:
                                features_in_contigs[contig_id] += 1

                            cur_feature_is_found = True
                            break
                        elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= qconfig.min_gene_overlap:
                            if found_list[i] == 0:
                                found_list[i] = 2
                                total_partial += 1
                            gene_blocks.append(cur_block)
                    # a complete hit ends the search over the remaining contigs too
                    if cur_feature_is_found:
                        break
                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    contig_info = ','.join([block.format_gene_info(region) for block in sorted(gene_blocks, key=lambda block: block.start)])
                    found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' % (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            # several non-operon containers accumulate into the same GENES counters
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    # NOTE(review): contigs_order[k] is the original index of the k-th longest
    # contig, so these lists hold counts[contigs_order[k]] at position k, which
    # is the inverse of "restore file order" unless the permutation is its own
    # inverse. Left as is; confirm what the consumers expect.
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)
Esempio n. 42
0
# Script: cut the region [pos1, pos2] (1-based, inclusive) out of the first
# sequence of a reference FASTA, save it with REF_MARGINS extra bases on each
# side to REF_FNAME, and collect all k-mers of the region itself.
REF_MARGINS = 300
REF_FNAME   = "ref.fa"

if len(sys.argv) != 4:
    # a single formatted string prints identically under Python 2 and Python 3
    print("Usage: %s reference pos1 pos2" % sys.argv[0])
    sys.exit(0)

pos1 = int(sys.argv[2])
pos2 = int(sys.argv[3])

# make sure pos1 <= pos2
if pos1 > pos2:
    pos1, pos2 = pos2, pos1

reference = fastaparser.read_fasta(sys.argv[1])[0][1]  # Returns list of FASTA entries (in tuples: name, seq)
# clamp the right border to the reference length
if len(reference) < pos2:
    pos2 = len(reference)

with open(REF_FNAME, 'w') as ref_file:
    ref_file.write(">reference\n")
    ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS) : min(len(reference), pos2 + REF_MARGINS)] + "\n")

misassembled_site = reference[pos1 - 1 : pos2]
kmers = set()

# slide a KMER_SIZE window over the region; the last window ends exactly at pos2
i = pos1 - 1
while i + KMER_SIZE <= pos2:
    kmers.add(reference[i : i + KMER_SIZE])
    i += 1
Esempio n. 43
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a cleaned copy of original_fpath to corrected_fpath.

    Entries shorter than min_contig are dropped unless is_reference is set.
    Every kept name goes through correct_name() and get_uniq_name(); unless
    qconfig.no_check is set, every kept sequence is replaced by the result of
    correct_seq().

    For a reference whose total length exceeds
    qconfig.MAX_REFERENCE_FILE_LENGTH the entries are additionally distributed
    over 'split_ref/part_N<ext>' files next to corrected_fpath, and the list
    of those files is stored in qconfig.splitted_ref.

    Returns:
        True on success; False when the file must be skipped (an empty
        sequence name, a sequence rejected by correct_seq(), no entries left,
        or every chromosome too long to be placed into a part).
    """
    kept_entries = []
    name_counts = defaultdict(int)
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.',
                    indent='    ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        fixed_name = correct_name(header)
        unique_name = get_uniq_name(fixed_name, name_counts)
        name_counts[fixed_name] += 1

        if qconfig.no_check:
            checked_seq = sequence
        else:
            # per the original note: the sequence is upper-cased here because
            # later stages look at uppercase letters only
            checked_seq = correct_seq(sequence, original_fpath)
            if not checked_seq:
                return False
        kept_entries.append((unique_name, checked_seq))

    if not kept_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent='    ')
        return False

    fastaparser.write_fasta(corrected_fpath, kept_entries)

    if not is_reference:
        return True

    total_len = sum(len(entry_seq) for (entry_name, entry_seq) in kept_entries)
    if total_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    # The reference is too large for a single file: split it into parts.
    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    parts_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(parts_dirpath):
        shutil.rmtree(parts_dirpath, ignore_errors=True)
    os.makedirs(parts_dirpath)

    def part_fpath(part_num):
        return os.path.join(parts_dirpath, "part_%d" % part_num) + fasta_ext

    part_len_limit = min(total_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    part_len = 0
    part_num = 1
    current_fpath = part_fpath(part_num)

    for (chrom_name, chrom_seq) in kept_entries:
        chrom_len = len(chrom_seq)
        if chrom_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chrom_name + " because its length is greater than " +
                    str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        part_len += chrom_len
        # Start a new part once the limit is exceeded, unless this chromosome
        # is the only content of the current part.
        overflows = part_len > part_len_limit and part_len != chrom_len
        if overflows:
            qconfig.splitted_ref.append(current_fpath)
            part_len = chrom_len
            part_num += 1
            current_fpath = part_fpath(part_num)
        fastaparser.write_fasta(current_fpath, [(chrom_name, chrom_seq)], mode='a')

    if part_len > 0:
        qconfig.splitted_ref.append(current_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Esempio n. 44
0
def process_single_file(contigs_fpath, index, coords_dirpath,
                        genome_stats_dirpath, reference_chromosomes,
                        ns_by_chromosomes, containers):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    The alignment coords file of the assembly is read from coords_dirpath,
    every aligned reference position is marked in a per-chromosome 0/1 list,
    and (optionally) gaps and genomic features are reported into files under
    genome_stats_dirpath.

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: assembly index, used only for log message prefixes.
        coords_dirpath: directory holding '<label>.coords' (and
            '<label>.coords.filtered', which is used unless
            qconfig.use_all_alignments is set).
        genome_stats_dirpath: directory for the '_gaps.txt' and
            '_genomic_features_<kind>.txt' output files.
        reference_chromosomes: mapping chromosome name -> chromosome length.
        ns_by_chromosomes: mapping chromosome name -> positions that must not
            count as covered (presumably positions of N's -- verify against
            the caller).
        containers: feature containers; each exposes `region_list` and `kind`.

    Returns:
        None if the coords file is missing or mentions an unknown chromosome.
        Otherwise a tuple `(ref_lengths, (results, unsorted_features_in_contigs,
        features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs))`
        where ref_lengths maps chromosome name -> number of covered positions.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath,
                                     corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath +
                     ') not found! Try to restart QUAST.',
                     indent='  ')
        return None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # genome_mapping[chr][i] == 1 means position i is covered by an alignment;
    # the lists have chr_len + 1 elements so they can be indexed with the
    # coordinates from the coords file directly.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(
        contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    # Contigs sorted by decreasing length; the original index of each contig
    # is remembered in contigs_order.
    sorted_contig_tuples = sorted(enumerate(contig_tuples),
                                  key=lambda x: len(x[1][1]),
                                  reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    features_in_contigs = [0] * len(
        sorted_contigs_names
    )  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {
    }  # for gene finding: contig_name --> list of AlignedBlock

    # Non-zero (truthy) when at least one feature container was passed.
    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning(
            'Run QUAST without genes and operons files to reduce memory consumption.'
        )
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []
    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # Columns: reference start/end | contig start/end | ... | ref name, contig name
            s1 = int(line.split('|')[0].split()[0])
            e1 = int(line.split('|')[0].split()[1])
            s2 = int(line.split('|')[1].split()[0])
            e2 = int(line.split('|')[1].split()[1])
            contig_name = line.split()[12].strip()
            chr_name = line.split()[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name,
                                 start=s1,
                                 end=e1,
                                 contig=contig_name,
                                 start_in_contig=s2,
                                 end_in_contig=e2))
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for i in range(s1, len(genome_mapping[chr_name])):
                    genome_mapping[chr_name][i] = 1
                for i in range(1, e1 + 1):
                    genome_mapping[chr_name][i] = 1
            else:  #if s1 <= e1:
                for i in range(s1, e1 + 1):
                    genome_mapping[chr_name][i] = 1

    # Positions listed in ns_by_chromosomes never count as covered.
    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    # In space-efficient mode the filtered coords file is deleted once parsed.
    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        gaps_fpath = os.path.join(
            genome_stats_dirpath, corr_assembly_label +
            '_gaps.txt') if not qconfig.space_efficient else '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    # A covered position or a position from ns_by_chromosomes
                    # terminates the current run of uncovered positions.
                    if genome_mapping[chr_name][
                            i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(
                                str(i - cur_gap_size) + ' ' + str(i - 1) +
                                '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # Gap that reaches the end of the chromosome.
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(
                        str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) +
                        '\n')

    # None means "no container of this kind was processed".
    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(
            genome_stats_dirpath, corr_assembly_label + '_genomic_features_' +
            container.kind.lower() + '.txt')
        # NOTE(review): found_file is closed only at the end of this loop
        # iteration, so an exception in between leaves it open.
        found_file = open(found_fpath, 'w')
        found_file.write('%s\t\t%s\t%s\t%s\t%s\n' %
                         ('ID or #', 'Start', 'End', 'Type', 'Contig'))
        found_file.write('=' * 50 + '\n')

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            gene_blocks = []
            if region.id is None:
                region.id = '# ' + str(region.number + 1)
            # Contigs are scanned from longest to shortest; the search for
            # this region stops at the first block that contains it entirely.
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if cur_block.seqname != region.seqname:
                        continue

                    # computing circular genomes
                    # (start > end: the block is split into two blocks)
                    if cur_block.start > cur_block.end:
                        blocks = [
                            AlignedBlock(
                                seqname=cur_block.seqname,
                                start=cur_block.start,
                                end=region.end + 1,
                                contig=cur_block.contig_name,
                                start_in_contig=cur_block.start_in_contig),
                            AlignedBlock(seqname=cur_block.seqname,
                                         start=1,
                                         end=cur_block.end,
                                         contig=cur_block.contig_name,
                                         end_in_contig=cur_block.end_in_contig)
                        ]
                        if cur_block.start_in_contig < cur_block.end_in_contig:
                            blocks[0].end_in_contig = blocks[
                                0].start_in_contig + (blocks[0].end -
                                                      blocks[0].start)
                            blocks[1].start_in_contig = blocks[
                                0].end_in_contig + 1
                        else:
                            blocks[0].end_in_contig = blocks[
                                0].start_in_contig - (blocks[1].end -
                                                      blocks[1].start)
                            blocks[1].start_in_contig = blocks[
                                0].end_in_contig - 1
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        # No overlap between the region and the block.
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        # The block contains the whole region: full hit.
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[
                                    i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            contig_info = block.format_gene_info(region)
                            found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                             (region.id, region.start,
                                              region.end, contig_info))
                            if container.kind == 'operon':
                                operons_in_contigs[
                                    contig_id] += 1  # inc number of found genes/operons in id-th contig
                            else:
                                features_in_contigs[contig_id] += 1

                            cur_feature_is_found = True
                            break
                        # Overlap of at least qconfig.min_gene_overlap: partial hit.
                        elif min(region.end, block.end) - max(
                                region.start,
                                block.start) >= qconfig.min_gene_overlap:
                            if found_list[i] == 0:
                                found_list[i] = 2
                                total_partial += 1
                            gene_blocks.append(block)
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                contig_info = ','.join([
                    block.format_gene_info(region)
                    for block in sorted(gene_blocks,
                                        key=lambda block: block.start)
                ])
                found_file.write(
                    '%s\t\t%d\t%d\tpartial\t%s\n' %
                    (region.id, region.start, region.end, contig_info))

        # Operon totals are overwritten per container, while totals of all
        # other container kinds are accumulated into the GENES fields.
        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    # NOTE(review): the counters are indexed by position in the length-sorted
    # list, while contigs_order holds original indices; confirm that indexing
    # the counters with contigs_order really restores the original order.
    unsorted_features_in_contigs = [
        features_in_contigs[idx] for idx in contigs_order
    ]
    unsorted_operons_in_contigs = [
        operons_in_contigs[idx] for idx in contigs_order
    ]

    return ref_lengths, (results, unsorted_features_in_contigs,
                         features_in_contigs, unsorted_operons_in_contigs,
                         operons_in_contigs)
Esempio n. 45
0
def parse_contigs_fpath(contigs_fpath):
    """Return one Contig (name and sequence length) per entry of the FASTA file, in file order."""
    return [Contig(name=entry_name, size=len(entry_seq))
            for entry_name, entry_seq in fastaparser.read_fasta(contigs_fpath)]
Esempio n. 46
0
def correct_meta_references(ref_fpaths,
                            corrected_dirpath,
                            downloaded_refs=False):
    """Copy every reference into corrected_dirpath and build the combined reference.

    Each reference file is rewritten sequence by sequence into its own
    corrected file (sequence names are prefixed with the corrected file name)
    and then appended to the combined reference file.

    Args:
        ref_fpaths: list of reference FASTA paths. Skipped references are
            removed from this list in place.
        corrected_dirpath: directory for the corrected and combined files.
        downloaded_refs: when True, an empty or invalid reference is skipped
            with a warning; otherwise it is reported through logger.error
            with exit_with_code=1.

    Returns:
        Tuple `(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`, where chromosomes_by_refs maps a reference label to a
        list of (corrected sequence name, sequence length) pairs.
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath,
                                      qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                     ref_fpath):
        # Appends one sequence to the corrected file of its reference.
        # total_references is the 1-based number of the sequence inside the
        # current reference file: the first sequence allocates the corrected
        # file path, later ones reuse it.
        # Returns (corrected name, corrected file path) or (None, None) when
        # the sequence is rejected by correct_seq().
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        # NOTE(review): the original `seq`, not the result of correct_seq(),
        # is written here -- confirm that this is intended.
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name,
                                    qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath,
                                    fastaparser.read_fasta(corr_seq_fpath),
                                    'a')
        elif downloaded_refs:
            logger.warning(
                'Skipping ' + ref_fpath + ' because it'
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!'
            )
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A path was registered only if at least one sequence was read.
            # For an empty file an unconditional pop() would drop the previous
            # reference's path (or raise IndexError on an empty list).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error(
                'Reference file ' + ref_fpath +
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' +
                         qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Esempio n. 47
0
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of all references and concatenate them into the combined reference.

    Every sequence of every reference is appended to a per-reference corrected
    file in corrected_dirpath under a name prefixed with that file's name;
    each successfully processed corrected file is then appended to the
    combined reference file.

    Args:
        ref_fpaths: list of reference FASTA paths; references that get skipped
            are removed from it in place.
        corrected_dirpath: output directory for corrected and combined files.
        downloaded_refs: if True, an empty/invalid reference is skipped with a
            warning; if False it is reported via logger.error with
            exit_with_code=1.

    Returns:
        `(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths)`;
        chromosomes_by_refs maps reference label -> list of
        (corrected sequence name, sequence length).
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Handles a single sequence. total_references is the running count of
        # sequences read from the current reference file, so a value of 1
        # means "first sequence": a new corrected file path is allocated and
        # registered. Later sequences reuse the last registered path.
        # Returns (None, None) if correct_seq() rejects the sequence.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        # NOTE(review): `seq` is written, the value returned by correct_seq()
        # is used for validation only -- confirm that this is intended.
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # Nothing was registered for a file without sequences, so popping
            # unconditionally would remove another reference's corrected path
            # (or fail with IndexError when the list is empty).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Esempio n. 48
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyse the resulting alignments.

    Runs align_contigs(), parses the produced coords (and, if
    qconfig.show_snps is set, SNP) files, then calls analyze_contigs() and
    analyze_coverage() and writes the accompanying report files into
    output_dirpath.

    Args:
        is_cyclic: passed through to analyze_contigs().
        index: assembly index, used for log message prefixes and passed to
            align_contigs().
        contigs_fpath: path to the assembly FASTA file.
        output_dirpath: directory for logs, reports and the nucmer output
            directory.
        ref_fpath: path to the reference FASTA file.
        old_contigs_fpath: passed through to align_contigs().
        bed_fpath: not used in this function body.
        parallel_by_chr: passed through to align_contigs().
        threads: passed through to align_contigs().

    Returns:
        Tuple `(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)`. If align_contigs() does not return
        NucmerStatus.OK the tuple is `(nucmer_status, {}, [], [], [])`;
        otherwise status is NucmerStatus.NOT_ALIGNED when ref_aligns is empty
        and NucmerStatus.OK when it is not.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    # In space-efficient mode all auxiliary outputs go to /dev/null.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    # NOTE(review): icarus_out_f and misassembly_f are handed to CAOutput
    # below; on the early return after a failed alignment they are not
    # closed in this function.
    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    # Any status other than OK is reported and ends the analysis with empty results.
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    # NOTE(review): coords_file is never closed explicitly in this function.
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    # The first two lines of the coords file are copied verbatim to the filtered file.
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        # Parsing stops at the first blank line.
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    # snps[reference name][contig name][reference position] -> list of SNP
    snps = {}
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            # Lines that do not start with a number are skipped.
            if not line[0].isdigit():
                continue
            # Consecutive duplicate lines are skipped.
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0]) # Kolya: python don't convert int<->str types automatically
            loc = int(line[3]) # Kolya: same as above

            # if (! exists $line[11]) { die "Malformed line in SNP file.  Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    # Every reference sequence is registered as one region spanning it entirely.
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # Bundle of the output handles used by the analysis steps; it is passed
    # to close_handlers() at the end of this function.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        # One TSV row per chromosome listing the contigs aligned to it, plus a
        # file of contigs whose aligned length exceeds 90% of the length
        # encoded in their name.
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                # NOTE: `aligns` is rebound here and shadows the dict parsed from the coords file.
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    # This condition repeats the enclosing check and is always true here.
                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            # Each contig is considered once, for the first chromosome it shows up in.
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            # Length and coverage are taken from names of the form '..._length_<len>_cov_<cov>'.
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Esempio n. 49
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the KMC-based k-mer analysis for every assembly.

    For each assembly the function reports k-mer completeness (k-mers
    shared with the reference as a percentage of the reference k-mer count
    returned by ``get_kmers_cnt``). Unless the reference has more than
    ``MAX_REF_CONTIGS_NUM`` sequences, it also reports the share of assembly
    length in correct / misassembled / undefined contigs and the number of
    translocations and relocations detected from downsampled k-mer markers.

    Assemblies for which ``check_kmc_successful_check`` succeeds are not
    recomputed; their values are read back from ``<label>.stat``.

    Args:
        output_dir: directory for ``kmc.log``, ``kmc.err``, the ``tmp``
            working directory and the per-assembly stats files.
        ref_fpath: path to the reference FASTA.
        contigs_fpaths: paths to the assembly FASTA files.
        logger: project logger used for all progress messages.

    Returns:
        None on every path; results are stored through ``reporting`` and
        ``create_kmc_stats_file``.
    """
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) +
                     '-mers...')

    # Reuse results of a previous run where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath,
                                      contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            with open(kmc_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # str.split() always yields at least one item, so an empty stats
            # file shows up as a blank first line; recompute such assemblies
            # instead of failing on float('').
            if not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # The correctness lines are only read when at least seven lines
            # are present in the stats file.
            if len(stats_content) >= 7:
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                                 '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                                 '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                                 '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                                 translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS,
                                 relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                                 translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC',
                                       ['kmc', 'kmc_tools'], logger)
    # The tool paths are published as module globals.
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc',
                                           kmc_dirpath,
                                           'KMC',
                                           platform_specific=True,
                                           is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools',
                                             kmc_dirpath,
                                             'KMC',
                                             platform_specific=True,
                                             is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(
            kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning('  Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('  Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate the log files left by any previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len,
                                    log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath,
                                 err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath +
                       '. Skipping...')
        return

    logger.info('  Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(idx) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len,
                                    log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(
            tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath,
            err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath,
                                      log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('  Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info('    Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath,
        err_fpath)
    for idx, (contigs_fpath,
              kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(idx) + assembly_label)

        report = reporting.get(contigs_fpath)
        # These stay None when correctness is not assessed; they are passed
        # as-is to create_kmc_stats_file below.
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning(
                'Reference is too fragmented. Scaffolding accuracy will not be assessed.'
            )
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath,
                qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(
                reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(
                    join(
                        tmp_dirpath,
                        qutils.label_from_fpath_for_fname(contigs_fpath) +
                        '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Pass 1: walk the k-mers in contig order and collect
                    # markers. A k-mer extends the current marker when it is
                    # on the same reference sequence as the previous k-mer and
                    # the contig distance is within 5% of the reference
                    # distance; otherwise the current marker is stored.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig],
                                                kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(
                                    abs(pos - prev_pos) /
                                    abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                # Reset so that the next k-mer starts a new
                                # chain instead of being compared with this one.
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)
                    # Pass 2: compare consecutive markers to detect misjoins.
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write(
                                    'Translocation in %s: %s %d | %s %d\n' %
                                    (contig, prev_chrom, prev_pos, ref_chrom,
                                     pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(
                                    pos, prev_pos, ref_pos, prev_ref_pos,
                                    cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write(
                                    'Relocation in %s: %d (%d) | %d (%d)\n' %
                                    (contig, prev_pos, prev_ref_pos, pos,
                                     ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            # Contigs without any marker count as "undefined".
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                             '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                             '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                             '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS,
                             translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                             translocations + relocations)

        create_kmc_stats_file(
            output_dir, contigs_fpath, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS), corr_len,
            mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    # The working directory is kept only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
Esempio n. 50
0
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Args:
        is_cyclic: passed through to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages.
        contigs_fpath: path to the assembly FASTA.
        output_dirpath: directory for the per-assembly report files.
        ref_fpath: path to the reference FASTA.
        reference_chromosomes: mapping of reference sequence name to length.
        ns_by_chromosomes: passed through to ``analyze_coverage``.
        old_contigs_fpath: passed through to ``align_contigs``.
        bed_fpath: not used by this function.
        threads: number of threads handed to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths,
        misassemblies_in_contigs, aligned_lengths_by_contigs)``. When the
        aligner does not return ``AlignerStatus.OK`` the tuple is
        ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    # In space-efficient mode all auxiliary outputs are discarded.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                         'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                         ' to the reference (non-zero exit code). ' +
                         ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' + coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # These two files were opened above and are not yet owned by a
        # CAOutput, so they have to be closed here before bailing out.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n') # TODO: move up
    ref_features = {}

    # Loading the regions (if any): every reference sequence is used as a
    # single region spanning its whole length.
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # From here on the open handles are bundled in ca_output and released by
    # close_handlers() at the end.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list, 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Contig length and coverage are parsed from the contig name.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per reference sequence: its name followed by
                    # the contigs aligned to it.
                    alignment_tsv_f.write(chr_name)
                    for contig in set(align.contig for align in chr_aligns):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered once, for the first
                        # reference sequence it is seen with.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        match = len_cov_pattern.search(contig)
                        if match:
                            contig_len, contig_cov = match.groups()
                            # Report the contig only if more than 90% of its
                            # length aligned to this reference sequence.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
Esempio n. 51
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the Jellyfish-based 101-mer analysis for every assembly.

    For each assembly the function reports k-mer completeness (reference
    k-mers found in the assembly, as a percentage of all k-mers read from the
    reference Jellyfish file) and the share of assembly length in sequences
    whose marker k-mers map to one / several / no reference sequences.

    Assemblies for which ``check_jf_successful_check`` succeeds are not
    recomputed; their values are read back from ``<label>.stat``.

    Args:
        output_dir: directory for the Jellyfish output and stats files.
        ref_fpath: path to the reference FASTA.
        contigs_fpaths: paths to the assembly FASTA files.
        logger: project logger used for all progress messages.

    Returns:
        None; results are stored through ``reporting`` and
        ``create_jf_stats_file``.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        # Pick whichever reload() this interpreter offers: importlib.reload
        # on Python 3.4+, imp.reload on older interpreters ("imp" no longer
        # exists on Python 3.12+), and the builtin as a last resort.
        try:
            from importlib import reload as reload_module
        except ImportError:
            try:
                from imp import reload as reload_module
            except ImportError:
                reload_module = reload  # noqa: F821 -- Python 2 builtin
        reload_module(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except Exception:
        # Best effort: any failure to build or load the bindings skips the
        # analysis, but KeyboardInterrupt/SystemExit are left to propagate.
        logger.warning('Failed unique 101-mers analysis.')
        return

    # Reuse results of a previous run where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths,
                                     ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            with open(jf_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # Four values are read below; recompute if any is missing.
            if len(stats_content) < 4:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, _count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx, assembly_kmers in enumerate(contigs_kmers):
            if assembly_kmers[kmer]:
                matched_kmers[idx] += 1
                matches += 1
        # Every 100th k-mer that is present in all assemblies is kept as a
        # marker for the chromosome-assignment step below.
        if matches == len(contigs_fpaths):
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    # Map each marker k-mer to the reference sequence it occurs in.
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            # Sequences with fewer than MIN_MARKERS markers are counted in
            # neither category and end up in the "none" remainder.
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
Esempio n. 52
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer: genome fraction, duplication ratio, genes and operons.

    Every assembly is processed by ``process_single_file`` (in parallel via
    joblib). The per-assembly results are written to
    ``<genome_stats_dirpath>/genome_info.txt``, stored through ``reporting``
    and, depending on the configuration, passed to the JSON/HTML savers and
    the plotter.

    Args:
        ref_fpath: path to the reference FASTA.
        aligned_contigs_fpaths: paths to the assemblies to analyze.
        output_dirpath: directory passed to the HTML saver.
        json_output_dirpath: directory for JSON output; falsy disables it.
        genes_fpaths: files with gene annotations (may be empty).
        operons_fpaths: files with operon annotations (may be empty).
        detailed_contigs_reports_dirpath: directory that contains the
            ``nucmer_output`` subdirectory with the coords files.
        genome_stats_dirpath: output directory, created if missing.

    Returns:
        ``[genes_container, operons_container]``, or None when the analysis
        failed for all assemblies.
    """

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Reference sequence name (first word of the FASTA header) -> length.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    # Drop failed assemblies, but keep every remaining result paired with the
    # assembly it belongs to so that nothing below is attributed to the wrong
    # file when only some assemblies fail.
    successful = [(contigs_fpath, res)
                  for contigs_fpath, res in zip(aligned_contigs_fpaths, process_results)
                  if res]
    if not successful:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return
    analyzed_contigs_fpaths = [contigs_fpath for contigs_fpath, _ in successful]
    process_results = [res for _, res in successful]

    ref_lengths = [res[0] for res in process_results]
    results_genes_operons_tuples = [res[1] for res in process_results]
    # NOTE(review): ref_lengths_by_contigs is not defined in this function;
    # presumably a module-level dict -- confirm.
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')
    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(analyzed_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        # (total + internal overlaps + ambiguous extra - unaligned) bases
        # divided by the number of covered reference bases.
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # The per-assembly lists and dicts built above only cover the assemblies
    # that were analyzed successfully, so the same path list is passed along
    # with them below.

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, analyzed_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, analyzed_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), analyzed_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(analyzed_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), analyzed_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(analyzed_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(analyzed_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
Esempio n. 53
0
    print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")	
    print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)")	
    sys.exit()

BREAK_SCAFFOLDS = False
if len(sys.argv) == 4:
    BREAK_SCAFFOLDS = True

N_NUMBER = None
counter = 0
if BREAK_SCAFFOLDS:
    N_NUMBER = int(sys.argv[2])

sizes_of_Ns_regions = dict()
new_fasta = []
for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])): 
    i = 0
    cur_contig_number = 1
    cur_contig_start = 0
    while (i < len(seq)) and (seq.find("N", i) != -1):
        start = seq.find("N", i)
        end = start + 1
        while (end != len(seq)) and (seq[end] == 'N'):
            end += 1        

        i = end + 1
        if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER:
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number), seq[cur_contig_start:start]))
            cur_contig_number += 1
            cur_contig_start = end
Esempio n. 54
0
def _region_display_id(region):
    """Return the identifier written to the genes/operons report for *region*.

    Uses ``region.id`` when it is set; otherwise falls back to the 1-based
    ordinal ``'# <region.number + 1>'``.
    """
    region_id = str(region.id)
    if region_id == 'None':
        region_id = '# ' + str(region.number + 1)
    return region_id


def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    The alignments are read from the Nucmer coords file
    ``<nucmer_path_dirpath>/<label>.coords`` (or ``.coords.filtered`` unless
    ``qconfig.use_all_alignments`` is set).

    Side effects:
      * writes ``<label>_gaps.txt`` into ``genome_stats_dirpath`` (output is
        discarded when ``qconfig.space_efficient`` is set);
      * writes ``<label>_genes.txt`` / ``<label>_operons.txt`` for every
        container that has a non-empty ``region_list``;
      * removes the ``.filtered`` coords file when ``qconfig.space_efficient``
        is set.

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: index of the assembly, used only for log prefixes.
        nucmer_path_dirpath: directory holding the coords files.
        genome_stats_dirpath: output directory for the report files.
        reference_chromosomes: mapping chromosome name -> chromosome length.
        genes_container: object with ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        ``None`` if the coords file is missing or mentions a chromosome that
        is not in ``reference_chromosomes``. Otherwise the tuple
        ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))``
        where ``ref_lengths`` maps chromosome name -> number of covered
        positions, ``results`` holds ``"covered_bp"``, ``"gaps_count"`` and the
        ``<field>_full`` / ``<field>_partial`` counters (``None`` when the
        corresponding container is empty), and the two lists hold the number
        of complete genes/operons per contig (contigs sorted by decreasing
        length).
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')
        return None

    # Per-chromosome coverage flags; index 0 is unused because coordinates are 1-based.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names) # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # The context manager guarantees the coords file is closed on every exit
    # path, including the early return for an unknown chromosome name.
    with open(nucmer_fpath, 'r') as coordfile:
        # Skip the header: everything up to and including the '====' ruler.
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            contig_coords = columns[1].split()
            fields = line.split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            contig_name = fields[12]
            chr_name = fields[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else: #if s1 <= e1:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    # os.devnull is the portable spelling of '/dev/null'.
    if qconfig.space_efficient:
        gaps_fpath = os.devnull
    else:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(chr_name + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            aligned_len = 0
            for i in range(1, chr_len + 1):
                if chr_mapping[i] == 1:
                    # A covered position ends the current gap; report it if long enough.
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                    aligned_len += 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            ref_lengths[chr_name] = aligned_len
            # Gap running up to the end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
            found_file.write('=========================================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        # NOTE(review): region.end + 1 appears to stand in for the
                        # chromosome end in the first half of a wrapped block -- confirm.
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                found_file.write('%s\t\t%d\t%d\tcomplete\n' %
                                                 (_region_display_id(region), region.start, region.end))
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break
                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    found_file.write('%s\t\t%d\t%d\tpartial\n' %
                                     (_region_display_id(region), region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
Esempio n. 55
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Run the basic statistics stage for all assemblies.

    For every assembly the function computes contig lengths, the number of
    'N' characters, N50/L50, N75/L75 (and NG50/LG50, NG75/LG75 when a reference
    length is known), total length and GC content, stores them in the
    assembly's report, optionally saves data for the HTML report and draws
    the Nx/NGx, cumulative, GC-content and coverage plots.

    Args:
        ref_fpath: path to the reference FASTA, or a false value when there is
            no reference (``qconfig.estimated_reference_size`` is then used as
            the reference length if set).
        contigs_fpaths: list of paths to assembly FASTA files.
        output_dirpath: directory for the plots; created if missing.
        results_dir: directory passed to the HTML saver and to ``Nx_plot``.

    Returns:
        None. Results are stored via ``reporting`` and written to disk; as a
        side effect ``qconfig.min_difference`` and
        ``qconfig.potential_scaffolds_assemblies`` are updated.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
            reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(
            ref_fpath)

        logger.info('  Reference genome:')
        # The conditional must be parenthesised: a conditional expression binds
        # looser than '+', so without the parentheses an undefined GC value
        # would replace the whole message with just 'undefined'.
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' +
                    str(reference_length) + ', num fragments = ' +
                    str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                '  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).'
            )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    # coverage_dict[fpath][c] = total length of contigs whose name carries "_cov_<c>"
    # (coverage truncated to an integer).
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(idx) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            # Register the assembly once as soon as any sequence looks like a scaffold.
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(
                    seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                if len(coverage) <= cov:
                    # Grow the histogram so that index `cov` exists.
                    coverage.extend([0] * (cov - len(coverage) + 1))
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [
        sorted(lengths, reverse=True) for lengths in lists_of_lengths
    ]
    num_contigs = max(len(lengths) for lengths in lists_of_lengths)
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs to plot individually: merge every `multiplicator`
        # consecutive lengths into one point; the remainder goes to the last point.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[
            sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
            for i in range(1, max_points)
            if (i * multiplicator) < len(lengths)
        ] for lengths in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(
                    (i - 1) * multiplicator):(i * multiplicator)]) if
                (i * multiplicator) < len(reference_lengths) else sum(
                    reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for binned, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            # len(binned) is evaluated before the append, i.e. it is the
            # number of complete bins built above.
            binned.append(sum(lengths[len(binned) * multiplicator:]))
    else:
        corr_lists_of_lengths = [list(lengths) for lengths in lists_of_lengths]

    # Saving for an HTML report
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if reference_lengths:
            html_saver.save_reference_lengths(results_dir, reference_lengths)
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for idx, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(
            contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Both optional parts are computed separately so that an undefined
        # value only replaces its own field, not the whole log line.
        gc_str = '%.2f' % total_GC if total_GC is not None else 'undefined'
        if total_length != 0:
            ns_per_100kbp_str = ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100kbp_str = 'undefined'
        logger.info('    ' + qutils.index_to_str(idx) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + gc_str +
                    ', # N\'s per 100 kbp = ' + ns_per_100kbp_str)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.GC,
                    ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            # Guard against assemblies that contain only empty sequences
            # (total length 0), mirroring the guard used for the log line.
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT,
                ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
                 if total_length != 0 else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS,
                             reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.REFGC,
                    ('%.2f' %
                     reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil(
        (largest_contig / 1000) / 600)  # divide on height of plot

    # The reference distribution (if any) is appended after the assemblies;
    # reference_index tells the consumers where it is.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions,
                                reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                    contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                        contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length] * len(contigs_fpaths))

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'),
                                'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                    list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(
                    contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(
                        output_dirpath,
                        qutils.label_from_fpath(contigs_fpath) +
                        '_GC_content_plot'))

        # Coverage histograms are drawn only if at least one assembly had
        # "_cov_" values in its contig names.
        if any(coverage_dict[contigs_fpath]
               for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths,
                                     output_dirpath)

    logger.main_info('Done.')
Esempio n. 56
0
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Copy all reference sequences into per-reference files and one combined file.

    Every sequence of every file in ``ref_fpaths`` is renamed to
    ``<corrected reference label>_<unique sequence name>`` and appended both
    to that reference's own file in ``corrected_dirpath`` and to the combined
    reference ``qconfig.combined_ref_name``. Processing of a reference stops
    at the first sequence rejected by ``correct_seq`` (the check is skipped
    when ``qconfig.no_check`` is set).

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps reference name ->
        list of ``(corrected sequence name, sequence length)``.
    """
    corrected_ref_fpaths = []
    chromosomes_by_refs = {}
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # `total_references` is the 1-based number of the sequence inside its
        # reference file: the first sequence creates the output file, the
        # following ones reuse the most recently created one.
        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            seq_fname = ref_name + ref_fasta_ext
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        # NOTE(review): the result of correct_seq() is used only as a validity
        # check; the original `seq` is what gets written -- confirm this is intended.
        if not qconfig.no_check and not correct_seq(seq, ref_fpath):
            return None, None

        entry = (corr_seq_name, seq)
        fastaparser.write_fasta(corr_seq_fpath, [entry], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [entry], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Reference names (file names without FASTA extension) that occur more than once.
    all_ref_names = [qutils.splitext_for_fasta_file(os.path.basename(fpath))[0]
                     for fpath in ref_fpaths]
    dupl_ref_names = set(name for name in all_ref_names if all_ref_names.count(name) > 1)

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in dupl_ref_names:
            # Disambiguate equally named files by their parent directory.
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        max_name_len = qutils.MAX_CONTIG_NAME - len(ref_name) - 1

        corr_seq_fpath = None
        for seq_number, (seq_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath), 1):
            seq_name = correct_name(seq_name, max_name_len)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, seq_number, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info('  All references combined in ' + qconfig.combined_ref_name)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths