Example 1
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                  tool_dirpath, tmp_dirpath):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, assembly_name + '_glimmer')
    err_fpath = os.path.join(out_dirpath, assembly_name + '_glimmer.stderr')

    #out_gff_path, out_fasta_path, unique, total, cnt = glimmerHMM(tool_dir,
    #    fasta_path, out_path, gene_lengths, err_path)

    out_gff_path, unique, total, cnt = glimmerHMM(tool_dirpath, contigs_fpath,
                                                  out_fpath, gene_lengths,
                                                  err_fpath, tmp_dirpath,
                                                  index)

    if out_gff_path:
        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' +
                    str(unique) + ' unique, ' + str(total) + ' total')
        logger.info('  ' + qutils.index_to_str(index) +
                    '  Predicted genes (GFF): ' + out_gff_path)

    return unique, cnt
Example 2
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

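    # GAGE is invoked as a shell script with positional arguments:
    # <gage_tool_path> <reference> <assembly_fasta> <tmp_dir> <min_contig_length>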
    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i),
        only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
Example 3
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)

    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, assembly_label + '_' + tool_name + '_genes.gff')
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, assembly_label + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)

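        # Each gene entry is assumed to be a tuple whose indices 2 and 3 hold the start/end
        # coordinates (so gene[3] - gene[2] is its length) and whose index 4 is a uniqueness key.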
        count = [sum([gene[3] - gene[2] > x for gene in genes]) for x in gene_lengths]
        unique_count = len(set([gene[4] for gene in genes]))
        total_count = len(genes)

        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)

    return unique_count, count
Example 4
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i),
        only_if_debug=False)
    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    log_out_f.close()
    log_err_f.close()

    return return_code
Example 5
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, assembly_name + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath)

    if not genes:
        unique_count = None
        count = None  # [None] * len(gene_lengths)
    else:
        tool_name = "genemark"
        out_gff_fpath = os.path.join(out_dirpath, assembly_name + '_' + tool_name + '_genes.gff')
        add_genes_to_gff(genes, out_gff_fpath)
        if OUTPUT_FASTA:
            out_fasta_fpath = os.path.join(out_dirpath, assembly_name + '_' + tool_name + '_genes.fasta')
            add_genes_to_fasta(genes, out_fasta_fpath)

        count = [sum([gene[3] - gene[2] > x for gene in genes]) for x in gene_lengths]
        unique_count = len(set([gene[4] for gene in genes]))
        total_count = len(genes)

        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_fpath)

    return unique_count, count
Example 6
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
                    num_threads):
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    err_file = open(err_fpath, 'w')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    return_code = qutils.call_subprocess(
        ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath,
         fasta_fpath],
        stdout=err_file,
        stderr=err_file,
        indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return

    genes = []
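    # gmsn.pl is expected to have left a heuristic model (<fasta_name>_hmm_heuristic.mod)
    # in the temporary directory; it is passed to gmhmmp below.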
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath,
                     err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
Example 7
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

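        # One parallel job per assembly (capped at max_threads); the remaining threads
        # are divided evenly among the jobs.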
        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error('  ' + qutils.index_to_str(i) +
                     'Failed predicting genes in ' + qutils.label_from_fpath(fasta_path) + '. ' +
                     ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                         if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)

        logger.main_info('Done.')
Example 8
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    err_file = open(err_fpath, 'w')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    return_code = qutils.call_subprocess(
        ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath,
         fasta_fpath],
        stdout=err_file,
        stderr=err_file,
        indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath,
                   out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
Example 9
File: glimmer.py Project: ctb/quast
 def run(contig_path, tmp_path):
     with open(err_path, 'a') as err_file:
         return_code = qutils.call_subprocess(
             [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
             stdout=err_file,
             stderr=err_file,
             indent='  ' + qutils.index_to_str(index) + '  ')
         return return_code
Example 10
File: glimmer.py Project: ctb/quast
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, assembly_label + '_glimmer')
    err_fpath = os.path.join(out_dirpath, assembly_label + '_glimmer.stderr')

    #out_gff_path, out_fasta_path, unique, total, cnt = glimmerHMM(tool_dir,
    #    fasta_path, out_path, gene_lengths, err_path)

    out_gff_path, unique, total, cnt = glimmerHMM(tool_dirpath,
        contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)

    if out_gff_path:
        logger.info('  ' + qutils.index_to_str(index) + '  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_path)

    return unique, cnt
Example 11
 def run(contig_path, tmp_path):
     with open(err_path, 'a') as err_file:
         return_code = qutils.call_subprocess(
             [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
             stdout=err_file,
             stderr=err_file,
             indent='  ' + qutils.index_to_str(index) + '  ')
         return return_code
Example 12
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """ Run GeneMark.hmm with this heuristic model (heu_dirpath)
        prompt> gmhmmp -m heu_11_45.mod sequence
        prompt> gm -m heu_11_45.mat sequence"""
    return_code = qutils.call_subprocess(
        [tool_exec, '-d', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath],
        stdout=err_file,
        stderr=err_file,
        indent='    ' + qutils.index_to_str(index))

    return return_code == 0 and os.path.isfile(out_fpath)
Example 13
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    assemblies = [Assembly(contigs_fpaths[i], labels[i]) for i in range(len(contigs_fpaths))]
    corr_assemblies = []
    for file_counter, (contigs_fpath, label) in enumerate(zip(contigs_fpaths, labels)):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

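        # Path where the corrected copy of this assembly is expected to be written:
        # <output_dirpath>/<corrected_dirname>/<label><ctg_fasta_ext>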
        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(output_dirpath, qconfig.corrected_dirname, label + ctg_fasta_ext))

        corr_assemblies.append(Assembly(corr_fpath, label))
        logger.main_info('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '%s ==> %s' % (contigs_fpath, label))

    return assemblies, corr_assemblies
Example 14
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    assemblies = [
        Assembly(contigs_fpaths[i], labels[i])
        for i in range(len(contigs_fpaths))
    ]
    corr_assemblies = []
    for file_counter, (contigs_fpath,
                       label) in enumerate(zip(contigs_fpaths, labels)):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(output_dirpath, qconfig.corrected_dirname,
                         label + ctg_fasta_ext))

        corr_assemblies.append(Assembly(corr_fpath, label))
        logger.main_info(
            '  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
            '%s ==> %s' % (contigs_fpath, label))

    return assemblies, corr_assemblies
Example 15
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    err_file = open(err_fpath, 'w')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    return_code = qutils.call_subprocess(
        ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads), '--sequence', fasta_fpath,
         '--out', tmp_dirpath],
        stdout=err_file,
        stderr=err_file,
        indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return
    genes = []
    _, _, fnames = os.walk(tmp_dirpath).next()
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Example 16
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
          num_threads):
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    err_file = open(err_fpath, 'w')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    return_code = qutils.call_subprocess(
        ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
         '--sequence', fasta_fpath, '--out', tmp_dirpath],
        stdout=err_file,
        stderr=err_file,
        indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return
    genes = []
    _, _, fnames = os.walk(tmp_dirpath).next()
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Example 17
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_lengths_from_fastafile(contigs_fpath)))

    import N50
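    # NA50/LA50 use the assembly length as the denominator; NGA50/LGA50 use the reference length.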
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        nga50 = N50.NG50(lens, reference_length)
        na75 = N50.NG50(lens, assembly_len, 75)
        nga75 = N50.NG50(lens, reference_length, 75)
        la50 = N50.LG50(lens, assembly_len)
        lga50 = N50.LG50(lens, reference_length)
        la75 = N50.LG50(lens, assembly_len, 75)
        lga75 = N50.LG50(lens, reference_length, 75)
        logger.info('  ' +
                    qutils.index_to_str(i) +
                    qutils.label_from_fpath(contigs_fpath) +
                 ', Largest alignment = ' + str(max(lens)) +
                 ', NA50 = ' + str(na50) +
                 ', NGA50 = ' + str(nga50) +
                 ', LA50 = ' + str(la50) +
                 ', LGA50 = ' + str(lga50))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NGA50, nga50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.NGA75, nga75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LGA50, lga50)
        report.add_field(reporting.Fields.LA75, la75)
        report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    # saving to JSON
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(json_output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        import plotter
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx', [reference_length for i in range(len(aligned_contigs_fpaths))])

    logger.info('Done.')
    return report_dict
Example 18
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(
            sum(fastaparser.get_lengths_from_fastafile(contigs_fpath)))

    import N50
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists,
                           assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        logger.info(
            '  ' + qutils.index_to_str(i) +
            qutils.label_from_fpath(contigs_fpath) + ', Largest alignment = ' +
            str(max(lens)) + ', NA50 = ' + str(na50) +
            (', NGA50 = ' +
             str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
            ', LA50 = ' + str(la50) +
            (', LGA50 = ' +
             str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    num_contigs = max([
        len(aligned_lengths_lists[i])
        for i in range(len(aligned_lengths_lists))
    ])

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
            'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath,
                    num_contigs > qconfig.max_points,
                    aligned_contigs_fpaths,
                    aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot',
                    'NAx',
                    assembly_lengths,
                    json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(
            output_dirpath,
            num_contigs > qconfig.max_points,
            aligned_contigs_fpaths,
            aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot',
            'NGAx',
            [reference_length for i in range(len(aligned_contigs_fpaths))],
            json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Example 19
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

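    # genome_mapping[chr_name][pos] is set to 1 for every reference position (1-based)
    # covered by at least one alignment.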
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names) # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")

        aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else: #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()

    # counting genome coverage and gaps number
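    # (a gap is a maximal run of uncovered reference positions of length >= qconfig.min_gap_size)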
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        print >>gaps_file, chr_name
        cur_gap_size = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    print >>gaps_file, i - cur_gap_size, i - 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1

        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            print >>gaps_file, chr_len - cur_gap_size + 1, chr_len
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        found_file = open(found_fpath, 'w')
        print >>found_file, '%s\t\t%s\t%s' % ('ID or #', 'Start', 'End')
        print >>found_file, '============================'

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            i = str(region.id)
                            if i == 'None':
                                i = '# ' + str(region.number + 1)
                            print >>found_file, '%s\t\t%d\t%d' % (i, region.start, region.end)
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
Example 20
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")
    
    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        #lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

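    # If there are too many contigs to plot, bin the sorted lengths into groups of
    # `multiplicator` contigs so the contig-length plots stay under qconfig.max_points points.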
    multiplicator = 1
    if num_contigs >= (qconfig.max_points*2):
        import math
        multiplicator = int(num_contigs/qconfig.max_points)
        max_points = num_contigs/multiplicator
        lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
        corr_lists_of_lengths = [[sum(list_of_length[((i-1)*multiplicator):(i*multiplicator)]) for i in range(1, max_points)
                                  if (i*multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index*multiplicator:]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined')
        
        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig/1000)/600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))], json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            list_of_GC_distributions_with_ref = list_of_GC_distributions
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')
Example 21
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")
    
    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        #lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
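    # N50/L50 are computed from the contig lengths alone; NG50/LG50 additionally require
    # the (known or estimated) reference length.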
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')
    
        ########################################################################
        # Drawing GC content plot...
        list_of_GC_distributions_with_ref = list_of_GC_distributions
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        # Drawing cumulative plot...
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx', [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
Example 22
File: quast.py Project: ctb/quast
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):

    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info('  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        scaffold_counter = 0
        for scaffold_counter, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
            if contigs_counter % 100 == 0:
                pass
            if contigs_counter > 520:
                pass
            cumul_contig_length = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
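            # Split the scaffold at every run of at least qconfig.Ns_break_threshold
            # consecutive 'N' characters; each resulting piece becomes a separate contig.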
            while (cumul_contig_length < len(seq)) and (seq.find('N', cumul_contig_length) != -1):
                start = seq.find("N", cumul_contig_length)
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                cumul_contig_length = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" +
                         str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" +
                 str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))

            contigs_counter += total_contigs_for_the_scaf
        if scaffold_counter + 1 != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append("  " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffold_counter + 1,
                         label,
                         contigs_counter,
                         label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append("  " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                    "    WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
Example 23
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            '  Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
    else:
        successful = install_genemark(
            os.path.join(qconfig.LIBS_LOCATION, 'genemark',
                         qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, fasta_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath,
                                   gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                                 unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error(
                    '  ' + qutils.index_to_str(i) +
                    'Failed predicting genes in ' +
                    qutils.label_from_fpath(fasta_path) + '. ' +
                    ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                     if tool_name == 'GeneMark-ES'
                     and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)

        logger.main_info('Done.')
Example 24
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir,
       results_dir):
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(
            fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        #lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths,
                                        lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC,
                         ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(
            reporting.Fields.UNCALLED_PERCENT,
            ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot',
                                'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        list_of_GC_distributions_with_ref = list_of_GC_distributions
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(
                contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot',
                'NGx', [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
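
do() above delegates the Nx/Lx math to a separate N50 module that is not shown on this page. Under the usual definition (sort contig lengths in descending order, then accumulate until x% of the assembly length, or of the reference length for NGx/LGx, is covered), a minimal sketch could look like this; the function name and signature are illustrative, not necessarily the module's:

def n50_and_l50(lengths, percent=50, reference_length=None):
    # Nx: length of the shortest contig in the minimal set covering x% of the
    # assembly (or of the reference, for NGx); Lx: how many contigs that takes.
    total = reference_length if reference_length else sum(lengths)
    threshold = total * percent / 100.0
    running = 0
    for count, length in enumerate(sorted(lengths, reverse=True), 1):
        running += length
        if running >= threshold:
            return length, count
    return None, None  # x% of the target length is unreachable (possible for NGx)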
Example no. 26
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info("  Reference genome:")
        logger.info(
            "    "
            + os.path.basename(ref_fpath)
            + ", Reference length = "
            + str(reference_length)
            + ", Reference GC % = "
            + "%.2f" % reference_GC
        )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info("  Estimated reference length = " + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver

            html_saver.save_reference_length(results_dir, reference_length)

    logger.info("  Contig files: ")
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info("    " + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count("N")

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

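    # Thin the length lists before plotting: if there are too many contigs,
    # sum them in bins of size 'multiplicator' so at most ~max_points are drawn.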
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math

        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs / multiplicator
        lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
        corr_lists_of_lengths = [
            [
                sum(list_of_length[((i - 1) * multiplicator) : (i * multiplicator)])
                for i in range(1, max_points)
                if (i * multiplicator) < len(list_of_length)
            ]
            for list_of_length in lists_of_lengths
        ]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator :]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver

        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info("  Calculating N50 and L50...")

    list_of_GC_distributions = []
    largest_contig = 0
    import N50

    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
        itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)
    ):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        logger.info(
            "    "
            + qutils.index_to_str(id)
            + qutils.label_from_fpath(contigs_fpath)
            + ", N50 = "
            + str(n50)
            + ", L50 = "
            + str(l50)
            + ", Total length = "
            + str(total_length)
            + ", GC % = "
            + ("%.2f" % total_GC if total_GC is not None else "undefined")
            + ", # N's per 100 kbp = "
            + (
                " %.2f" % (float(number_of_Ns) * 100000.0 / float(total_length))
                if total_length != 0
                else "undefined"
            )
        )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ("%.2f" % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT, ("%.2f" % (float(number_of_Ns) * 100000.0 / float(total_length)))
            )
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, "%.2f" % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math

    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # largest contig (kbp) divided by the plot height (600 px)

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver

        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(
        results_dir,
        num_contigs > qconfig.max_points,
        contigs_fpaths,
        lists_of_lengths,
        output_dirpath + "/Nx_plot",
        "Nx",
        [],
        json_output_dir=json_output_dir,
    )
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(
            results_dir,
            num_contigs > qconfig.max_points,
            contigs_fpaths,
            lists_of_lengths,
            output_dirpath + "/NGx_plot",
            "NGx",
            [reference_length for i in range(len(contigs_fpaths))],
            json_output_dir=json_output_dir,
        )

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(
            ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + "/cumulative_plot", "Cumulative length"
        )
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            list_of_GC_distributions_with_ref = list_of_GC_distributions
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(
                ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + "/GC_content_plot"
            )

    logger.main_info("Done.")
Example no. 27
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath,
                        genome_stats_dirpath, reference_chromosomes,
                        genes_container, operons_container):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath,
                                     assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath +
                     ') not found! Try to restart QUAST.',
                     indent='  ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(
        contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples,
                           key=lambda contig: len(contig[1]),
                           reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(
        sorted_contigs_names
    )  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {
    }  # for gene finding: contig_name --> list of AlignedBlock
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                         "differ from the names in the reference. Try to remove the file and restart QUAST.")
            return None

        aligned_blocks_by_contig_name[contig_name].append(
            AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts at the end of a chromosome and ends at its beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else:  #if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()

    # counting genome coverage and gaps number
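    # A gap is a run of at least qconfig.min_gap_size consecutive uncovered
    # reference positions; gap coordinates are written to the *_gaps.txt file.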
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath,
                              assembly_label + '_gaps.txt')
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        print >> gaps_file, chr_name
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    print >> gaps_file, i - cur_gap_size, i - 1
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            print >> gaps_file, chr_len - cur_gap_size + 1, chr_len
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container, genes_in_contigs, reporting.Fields.GENES,
         '_genes.txt'),
        (operons_container, operons_in_contigs, reporting.Fields.OPERONS,
         '_operons.txt')
    ]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        print >> found_file, '%s\t\t%s\t%s\t%s' % ('ID or #', 'Start', 'End',
                                                   'Type')
        print >> found_file, '========================================='

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[
                            region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [
                            AlignedBlock(seqname=cur_block.seqname,
                                         start=cur_block.start,
                                         end=region.end + 1),
                            AlignedBlock(seqname=cur_block.seqname,
                                         start=1,
                                         end=cur_block.end)
                        ]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[
                                    i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            print >> found_file, '%s\t\t%d\t%d\tcomplete' % (
                                region_id, region.start, region.end)
                            feature_in_contigs[
                                contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(
                                region.end, block.end) - max(
                                    region.start,
                                    block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                print >> found_file, '%s\t\t%d\t%d\tpartial' % (
                    region_id, region.start, region.end)

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
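
The gap-counting loop above walks a per-base 0/1 coverage mask for each reference chromosome, counting covered positions and reporting every uncovered run of at least qconfig.min_gap_size bases as a gap. A minimal sketch of the same bookkeeping on a plain Python list, with illustrative names:

def coverage_and_gaps(mask, min_gap_size):
    # mask[i] is 1 if reference position i is covered by an alignment, else 0
    covered_bp = sum(mask)
    gaps = []
    gap_start = None
    for pos, bit in enumerate(mask):
        if bit == 0:
            if gap_start is None:
                gap_start = pos
        else:
            if gap_start is not None and pos - gap_start >= min_gap_size:
                gaps.append((gap_start, pos - 1))
            gap_start = None
    if gap_start is not None and len(mask) - gap_start >= min_gap_size:
        gaps.append((gap_start, len(mask) - 1))
    return covered_bp, gaps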
Example no. 28
0
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath,
                              labels):

    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append('  ' +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(
            '  ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
            '  breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath,
                                         qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        scaffold_counter = 0
        for scaffold_counter, (name, seq) in enumerate(
                fastaparser.read_fasta(contigs_fpath)):
            cumul_contig_length = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
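            # Scan for runs of 'N': each run of at least qconfig.Ns_break_threshold
            # bases splits the scaffold, producing contigs named <name>_1, _2, ...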
            while (cumul_contig_length < len(seq)) and (seq.find(
                    'N', cumul_contig_length) != -1):
                start = seq.find("N", cumul_contig_length)
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                cumul_contig_length = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" +
                         str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" + str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))

            contigs_counter += total_contigs_for_the_scaf
        if scaffold_counter + 1 != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath,
                                    broken_scaffolds_fasta)
            logs.append(
                "  " +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                "    %d scaffolds (%s) were broken into %d contigs (%s)" %
                (scaffold_counter + 1, label, contigs_counter,
                 label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(
                "  " +
                qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                "    WARNING: nothing was broken, skipping '%s broken' from further analysis"
                % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
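
The scaffold-breaking loop above cuts a sequence at every run of 'N' that is at least qconfig.Ns_break_threshold bases long and numbers the resulting contigs. A compact regex-based sketch of the same rule, with illustrative names:

import re

def break_scaffold(name, seq, threshold):
    # With a capturing group, re.split keeps the N-runs at odd indices, so the
    # even-indexed pieces are the contigs between sufficiently long N-runs.
    parts = re.split('(N{%d,})' % threshold, seq)
    contigs = []
    for i, piece in enumerate(parts):
        if i % 2 == 0 and piece:
            contigs.append(('%s_%d' % (name, len(contigs) + 1), piece))
    return contigs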