Exemple #1
0
def save_tick_x(results_dirpath, tick_x):
    json_fpath = json_saver.save_tick_x(results_dirpath, tick_x)
    if json_fpath:
        append(results_dirpath, json_fpath, "tickX")
Exemple #2
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info("  Reference genome:")
        logger.info(
            "    "
            + os.path.basename(ref_fpath)
            + ", Reference length = "
            + str(reference_length)
            + ", Reference GC % = "
            + "%.2f" % reference_GC
        )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info("  Estimated reference length = " + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver

            html_saver.save_reference_length(results_dir, reference_length)

    logger.info("  Contig files: ")
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info("    " + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count("N")

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math

        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs / multiplicator
        lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
        corr_lists_of_lengths = [
            [
                sum(list_of_length[((i - 1) * multiplicator) : (i * multiplicator)])
                for i in range(1, max_points)
                if (i * multiplicator) < len(list_of_length)
            ]
            for list_of_length in lists_of_lengths
        ]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator :]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver

        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info("  Calculating N50 and L50...")

    list_of_GC_distributions = []
    largest_contig = 0
    import N50

    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
        itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)
    ):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        logger.info(
            "    "
            + qutils.index_to_str(id)
            + qutils.label_from_fpath(contigs_fpath)
            + ", N50 = "
            + str(n50)
            + ", L50 = "
            + str(l50)
            + ", Total length = "
            + str(total_length)
            + ", GC % = "
            + ("%.2f" % total_GC if total_GC is not None else "undefined")
            + ", # N's per 100 kbp = "
            + " %.2f" % (float(number_of_Ns) * 100000.0 / float(total_length))
            if total_length != 0
            else "undefined"
        )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ("%.2f" % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT, ("%.2f" % (float(number_of_Ns) * 100000.0 / float(total_length)))
            )
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, "%.2f" % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math

    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver

        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(
        results_dir,
        num_contigs > qconfig.max_points,
        contigs_fpaths,
        lists_of_lengths,
        output_dirpath + "/Nx_plot",
        "Nx",
        [],
        json_output_dir=json_output_dir,
    )
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(
            results_dir,
            num_contigs > qconfig.max_points,
            contigs_fpaths,
            lists_of_lengths,
            output_dirpath + "/NGx_plot",
            "NGx",
            [reference_length for i in range(len(contigs_fpaths))],
            json_output_dir=json_output_dir,
        )

    if qconfig.draw_plots:
        ########################################################################import plotter
        # Drawing cumulative plot...
        plotter.cumulative_plot(
            ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + "/cumulative_plot", "Cumulative length"
        )
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            list_of_GC_distributions_with_ref = list_of_GC_distributions
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(
                ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + "/GC_content_plot"
            )

    logger.main_info("Done.")
Exemple #3
0
def save_tick_x(results_dirpath, tick_x):
    json_fpath = json_saver.save_tick_x(results_dirpath, tick_x)
    if json_fpath:
        append(results_dirpath, json_fpath, 'tickX')
Exemple #4
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")
    
    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        #lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points*2):
        import math
        multiplicator = int(num_contigs/qconfig.max_points)
        max_points = num_contigs/multiplicator
        lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
        corr_lists_of_lengths = [[sum(list_of_length[((i-1)*multiplicator):(i*multiplicator)]) for i in range(1, max_points)
                                  if (i*multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index*multiplicator:]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) +
                    qutils.label_from_fpath(contigs_fpath) + \
                    ', N50 = ' + str(n50) + \
                    ', L50 = ' + str(l50) + \
                    ', Total length = ' + str(total_length) + \
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
                    ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined')
        
        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig/1000)/600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))], json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        ########################################################################import plotter
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            list_of_GC_distributions_with_ref = list_of_GC_distributions
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')