Example #1
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name += '_' + qutils.correct_name(seq_name[:20])
        if not qconfig.no_check:
            corr_seq = seq.upper()
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], corr_seq)
            if re.compile(r'[^ACGTN]').search(corr_seq):
                logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                        indent='    ')
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Example #2
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, min_contig, labels):
    assemblies = []

    for i, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[i]

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, label + ctg_fasta_ext))

        assembly = Assembly(corr_fpath, label)

        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # Handle fasta
        lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)
        if not sum(l for l in lengths if l >= min_contig):
            logger.warning(
                "Skipping %s because it doesn't contain contigs >= %d bp." %
                (os.path.basename(contigs_fpath), min_contig))
            continue

        # correcting
        if not quast.correct_fasta(contigs_fpath, corr_fpath, min_contig):
            continue

        assemblies.append(assembly)

    return assemblies
Example #3
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, min_contig, labels):
    assemblies = []

    for i, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[i]

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, label + ctg_fasta_ext))

        assembly = Assembly(corr_fpath, label)

        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # Handle fasta
        lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)
        if not sum(l for l in lengths if l >= min_contig):
            logger.warning("Skipping %s because it doesn't contain contigs >= %d bp."
                           % (os.path.basename(contigs_fpath), min_contig))
            continue

        # correcting
        if not quast.correct_fasta(contigs_fpath, corr_fpath, min_contig):
            continue

        assemblies.append(assembly)

    return assemblies
Example #4
0
File: quast.py Project: ctb/quast
def _correct_reference(ref_fpath, corrected_dirpath):
    ref_fname = os.path.basename(ref_fpath)
    name, fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, name + fasta_ext))
    if not correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True):
        ref_fpath = ''
    else:
        logger.main_info('  %s ==> %s' % (ref_fpath, qutils.name_from_fpath(corr_fpath)))
        ref_fpath = corr_fpath

    return ref_fpath
Example #5
0
def _correct_reference(ref_fpath, corrected_dirpath):
    ref_fname = os.path.basename(ref_fpath)
    name, fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, name + fasta_ext))

    if not correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True):
        ref_fpath = ''
    else:
        logger.info('  %s ==> %s' % (ref_fpath, qutils.name_from_fpath(corr_fpath)))
        ref_fpath = corr_fpath

    return ref_fpath
Example #6
0
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    assemblies = [Assembly(contigs_fpaths[i], labels[i]) for i in range(len(contigs_fpaths))]
    corr_assemblies = []
    for file_counter, (contigs_fpath, label) in enumerate(zip(contigs_fpaths, labels)):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(output_dirpath, qconfig.corrected_dirname, label + ctg_fasta_ext))

        corr_assemblies.append(Assembly(corr_fpath, label))
        logger.main_info('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '%s ==> %s' % (contigs_fpath, label))

    return assemblies, corr_assemblies
Example #7
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        seq_fname = ref_name
        if total_references > 1:
            seq_fname += '_' + qutils.correct_name(seq_name[:20])
        seq_fname += ref_fasta_ext

        corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)

        corrected_ref_fpaths.append(corr_seq_fpath)

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        return corr_seq_name
Example #8
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        seq_fname = ref_name
        if total_references > 1:
            seq_fname += '_' + qutils.correct_name(seq_name[:20])
        seq_fname += ref_fasta_ext

        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, seq_fname))
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)

        corrected_ref_fpaths.append(corr_seq_fpath)

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)],
                                'a')

        return corr_seq_name
Example #9
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                    ref_fpath):
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name += '_' + qutils.correct_name(seq_name[:20])
        if not qconfig.no_check:
            corr_seq = seq.upper()
            dic = {
                'M': 'N',
                'K': 'N',
                'R': 'N',
                'Y': 'N',
                'W': 'N',
                'S': 'N',
                'V': 'N',
                'B': 'N',
                'H': 'N',
                'D': 'N'
            }
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], corr_seq)
            if re.compile(r'[^ACGTN]').search(corr_seq):
                logger.warning('Skipping ' + ref_fpath +
                               ' because it contains non-ACGTN characters.',
                               indent='    ')
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)],
                                'a')

        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Example #10
0
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    assemblies = [
        Assembly(contigs_fpaths[i], labels[i])
        for i in range(len(contigs_fpaths))
    ]
    corr_assemblies = []
    for file_counter, (contigs_fpath,
                       label) in enumerate(zip(contigs_fpaths, labels)):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(output_dirpath, qconfig.corrected_dirname,
                         label + ctg_fasta_ext))

        corr_assemblies.append(Assembly(corr_fpath, label))
        logger.main_info(
            '  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
            '%s ==> %s' % (contigs_fpath, label))

    return assemblies, corr_assemblies
Example #11
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for i, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[i]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info("  breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []
            contigs_counter = 0

            for i, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
                i = 0
                cur_contig_number = 1
                cur_contig_start = 0
                while (i < len(seq)) and (seq.find("N", i) != -1):
                    start = seq.find("N", i)
                    end = start + 1
                    while (end != len(seq)) and (seq[end] == 'N'):
                        end += 1

                    i = end + 1
                    if (end - start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (name.split()[0] + "_" +
                             str(cur_contig_number),
                             seq[cur_contig_start:start]))
                        cur_contig_number += 1
                        cur_contig_start = end

                broken_scaffolds_fasta.append(
                    (name.split()[0] + "_" +
                     str(cur_contig_number),
                     seq[cur_contig_start:]))

                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info("      %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (i + 1,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))

            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
Example #12
0
File: quast.py Project: ctb/quast
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):

    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        scaffold_counter = 0
        for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
            if contigs_counter % 100 == 0:
                pass
            if contigs_counter > 520:
                pass
            cumul_contig_length = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            while (cumul_contig_length < len(seq)) and (seq.find('N', cumul_contig_length) != -1):
                start = seq.find("N", cumul_contig_length)
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                cumul_contig_length = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" +
                         str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" +
                 str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))

            contigs_counter += total_contigs_for_the_scaf
        if scaffold_counter + 1 != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append("  " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffold_counter + 1,
                         label,
                         contigs_counter,
                         label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append("  " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
Example #13
0
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for i, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[i]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info('  %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info("  breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []
            contigs_counter = 0

            for i, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
                i = 0
                cur_contig_number = 1
                cur_contig_start = 0
                while (i < len(seq)) and (seq.find("N", i) != -1):
                    start = seq.find("N", i)
                    end = start + 1
                    while (end != len(seq)) and (seq[end] == 'N'):
                        end += 1

                    i = end + 1
                    if (end - start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (name.split()[0] + "_" +
                             str(cur_contig_number),
                             seq[cur_contig_start:start]))
                        cur_contig_number += 1
                        cur_contig_start = end

                broken_scaffolds_fasta.append(
                    (name.split()[0] + "_" +
                     str(cur_contig_number),
                     seq[cur_contig_start:]))

                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info("      %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (i + 1,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))

            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
Example #14
0
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath,
                              labels):

    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append('  ' +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(
            '  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
            '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath,
                                         qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        scaffold_counter = 0
        for scaffold_counter, (name, seq) in enumerate(
                fastaparser.read_fasta(contigs_fpath)):
            if contigs_counter % 100 == 0:
                pass
            if contigs_counter > 520:
                pass
            cumul_contig_length = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            while (cumul_contig_length < len(seq)) and (seq.find(
                    'N', cumul_contig_length) != -1):
                start = seq.find("N", cumul_contig_length)
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                cumul_contig_length = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" +
                         str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" + str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))

            contigs_counter += total_contigs_for_the_scaf
        if scaffold_counter + 1 != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath,
                                    broken_scaffolds_fasta)
            logs.append(
                "  " +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                (scaffold_counter + 1, label, contigs_counter,
                 label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(
                "  " +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis"
                % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs