Beispiel #1
0
def js_data_gen(assemblies,
                contigs_fpaths,
                chromosomes_length,
                output_dirpath,
                structures_by_labels,
                contigs_by_assemblies,
                ambiguity_alignments_by_labels=None,
                contig_names_by_refs=None,
                ref_fpath=None,
                stdout_pattern=None,
                features_data=None,
                gc_fpath=None,
                cov_fpath=None,
                physical_cov_fpath=None,
                json_output_dir=None):
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(
                    align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS,
                             similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath,
                                             qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(
        chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names,
                                          contig_names_by_refs)
    physical_cov_data, physical_max_depth = parse_cov_fpath(
        physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    gc_data, max_gc = parse_cov_fpath(gc_fpath, chr_names, chr_full_names,
                                      contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [
        reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50,
        reporting.Fields.NG75
    ]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict(
        (chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [
                contig for contig in chr_names
                if contig_names_by_refs[contig] == chr
            ]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [
            chromosomes_length[contig] for contig in ref_contigs
        ]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' +
                            str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(chr, cov_data, 'coverage_data',
                                       max_depth,
                                       'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(chr, physical_cov_data, 'physical_coverage_data', physical_max_depth, 'physical_max_depth') \
            if physical_cov_data else None
        gc_data_str = format_cov_data(chr, gc_data, 'gc_data', 100,
                                      'max_gc') if gc_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks, structures_by_labels,
                                               contigs_by_assemblies, ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str, gc_data_str=gc_data_str,
                                               contig_names_by_refs=contig_names_by_refs, output_dir_path=output_all_files_dir_path)
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(
            chr,
            ref_contigs,
            ref_name,
            json_output_dir,
            alignment_viewer_fpath,
            ref_data_str,
            ms_selectors,
            ref_data=ref_data,
            features_data=features_data,
            assemblies_data=assemblies_data,
            contigs_structure_str=contigs_structure_str,
            additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(
        contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels,
        contig_names_by_refs, chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir,
                          too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(
        qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [
        qconfig.assembly_labels_by_fpath[contigs_fpath]
        for contigs_fpath in contigs_fpaths
    ]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels),
                                'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname,
                                             qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath,
                                'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [
            len(aligned_assemblies[chr]) for chr in chr_full_names
        ]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names, key=natural_sort):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr,
                aligned_bases_by_chr,
                chr_sizes,
                contigs_fpaths,
                contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(
                    len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(
                reference_dict)
        html_saver.save_icarus_data(json_output_dir,
                                    main_data_dict['table_references'],
                                    'table_references',
                                    as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr,
                aligned_bases_by_chr,
                chr_sizes,
                contigs_fpaths,
                contig_names_by_refs,
                one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(
                ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(
                num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(
                chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(
                num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir,
                                        main_data_dict['one_reference'],
                                        'menu_reference',
                                        as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath,
                                main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)

    return main_menu_fpath
Beispiel #2
0
def get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels, contig_names_by_refs, ref_names,
                     chr_full_names):
    additional_data = []
    additional_data.append('var links_to_chromosomes;')
    one_html = len(chr_full_names) == 1
    if not one_html:
        additional_data.append('links_to_chromosomes = {};')
        for ref_name in ref_names:
            chr_name = ref_name
            if contig_names_by_refs and ref_name in contig_names_by_refs:
                chr_name = contig_names_by_refs[ref_name]
            chr_name = get_html_name(chr_name, chr_full_names)
            additional_data.append('links_to_chromosomes["' + ref_name + '"] = "' + chr_name + '";')
    contigs_sizes_str = ['var contig_data = {};']
    contigs_sizes_str.append('var chromosome;')
    contigs_sizes_lines = []
    total_len = 0
    min_contig_size = qconfig.min_contig
    too_many_contigs = False
    has_aligned_contigs = structures_by_labels.values()
    for assembly in contigs_by_assemblies:
        contigs_sizes_str.append('contig_data["' + assembly + '"] = [ ')
        cum_length = 0
        contigs = sorted(contigs_by_assemblies[assembly], key=lambda x: x.size, reverse=True)
        last_contig_num = min(len(contigs), qconfig.max_contigs_num_for_size_viewer)
        contig_threshold = contigs[last_contig_num - 1].size
        not_used_nx = [nx for nx in nx_marks]
        if len(contigs) > qconfig.max_contigs_num_for_size_viewer:
            too_many_contigs = True

        for i, alignment in enumerate(contigs):
            if i >= last_contig_num:
                break
            cum_length, contigs_sizes_lines, align, not_used_nx = add_contig(cum_length, alignment, not_used_nx, assemblies_n50,
                                                                assembly, contigs, contigs_sizes_lines, i, structures_by_labels,
                                                                             has_aligned_contigs=has_aligned_contigs)
            contigs_sizes_str.append(align)
        if len(contigs) > qconfig.max_contigs_num_for_size_viewer:
            assembly_len = cum_length
            remained_len = sum(alignment.size for alignment in contigs[last_contig_num:])
            cum_length += remained_len
            remained_genes = ['{start:' + str(gene.start) + ',end:' + str(gene.end) + '}' for contig in contigs[last_contig_num:]
                              for gene in contig.genes]
            remained_contigs_name = str(len(contigs) - last_contig_num) + ' hidden contigs shorter than ' + str(contig_threshold) + \
                                    ' bp (total length: ' + format_long_numbers(remained_len) + ' bp)'
            contigs_sizes_str.append(('{name:"' + remained_contigs_name + '", size:' + str(remained_len) +
                                     ', contig_type:"short_contigs"' + (',genes:[' + ','.join(remained_genes) + ']},'
                                      if qconfig.gene_finding else '},')))
        if not_used_nx and last_contig_num < len(contigs):
            for i, alignment in enumerate(contigs[last_contig_num:]):
                if not not_used_nx:
                    break
                assembly_len, contigs_sizes_lines, align, not_used_nx = add_contig(assembly_len, alignment, not_used_nx, assemblies_n50,
                                                                    assembly, contigs, contigs_sizes_lines, last_contig_num + i, structures_by_labels, only_nx=True)
        total_len = max(total_len, cum_length)
        contigs_sizes_str[-1] = contigs_sizes_str[-1][:-1] + '];\n\n'
    contigs_sizes_str = '\n'.join(contigs_sizes_str)
    contigs_sizes_str += 'var contigLines = [' + ','.join(contigs_sizes_lines) + '];\n\n'
    contigs_sizes_str += 'var contigs_total_len = ' + str(total_len) + ';\n'
    contigs_sizes_str += 'var minContigSize = ' + str(min_contig_size) + ';'
    contig_viewer_data = contigs_sizes_str + '\n'.join(additional_data)
    return contig_viewer_data, too_many_contigs
Beispiel #3
0
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None, ref_fpath=None,
                stdout_pattern=None, features_data=None, cov_fpath=None, physical_cov_fpath=None, json_output_dir=None):
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, not_covered, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, not_covered, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(cov_data, max_depth, chr, 'coverage_data', 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(physical_cov_data, physical_max_depth, chr, 'physical_coverage_data', 'physical_max_depth') \
            if physical_cov_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks, structures_by_labels,
                                               contigs_by_assemblies, ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               contig_names_by_refs=contig_names_by_refs, output_dir_path=output_all_files_dir_path)
        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath, ref_data_str, ms_selectors,
                                        ref_data=ref_data, features_data=features_data, assemblies_data=assemblies_data,
                                        contigs_structure_str=contigs_structure_str, additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels,
                                                           contig_names_by_refs, chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
                                                                                contig_names_by_refs, one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths,
                                                                                contig_names_by_refs, one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)

    html_saver.save_icarus_links(output_dirpath, icarus_links)
    if json_output_dir:
        json_saver.save_icarus_links(json_output_dir, icarus_links)

    return main_menu_fpath