Ejemplo n.º 1
0
def make_taxonomy_series_chart(tax_profile,
                               sample_list,
                               outfile,
                               krona_path,
                               metric='efpkg'):
    """Creates a Krona taxonomy chart comparing several samples.

    The Krona XML document (attribute declarations, one dataset per sample and
    the node tree returned by get_lca_dataseries_tax_xml for ROOT_TAXONOMY_ID)
    is written to outfile. Krona Tools is then run on that file, with the HTML
    report path set to outfile + '.html'.

    Sample identifiers are inserted into the XML as-is, without escaping.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object; presumably
            it must carry the per-sample, per-metric attributes expected by
            get_lca_dataseries_tax_xml (not verifiable from this function)
        sample_list (list of str): sample identifiers, one dataset is declared
            for each of them
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default). For 'readcount' and
            'proteincount' no separate score attribute is declared.
    """
    with open(outfile, 'w') as out:
        # Header: the count attribute depends on the chosen metric
        if metric == 'proteincount':
            count_attribute = ''.join(
                ('\t\t<attribute display="Protein count">', metric,
                 '</attribute>\n'))
        else:
            count_attribute = \
                '\t\t<attribute display="Read count">readcount</attribute>\n'
        header = [
            '<krona key="false">\n',
            ''.join(('\t<attributes magnitude="', metric, '">\n')),
            count_attribute,
        ]
        # Any metric other than the two plain counts gets its own score attribute
        if metric not in ('readcount', 'proteincount'):
            header.append(''.join(
                ('\t\t<attribute display="Score:', metric, '">', metric,
                 '</attribute>\n')))
        header.append(
            '\t\t<attribute display="AAI %" mono="true">identity</attribute>\n')
        header.append('\t</attributes>\n')
        header.append(
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n')
        out.writelines(header)

        # Datasets: one entry per sample, in the order given
        out.write('\t<datasets>\n')
        out.writelines(
            ''.join(('\t\t<dataset>', sample, '</dataset>\n'))
            for sample in sample_list)
        out.write('\t</datasets>\n')

        # Nodes: only the XML text of the returned pair is used
        nodes_xml, _ = get_lca_dataseries_tax_xml(tax_profile,
                                                  sample_list,
                                                  ROOT_TAXONOMY_ID,
                                                  1,
                                                  metric=metric)
        out.write(nodes_xml)
        out.write('</krona>')

    # Generate the HTML chart once the XML file has been closed
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
def run_bgr_search(parser, command):
    """Runs the DIAMOND search against the background database.

    The query is the sample's file named with
    parser.options.ref_hits_fastq_name in the project directory, and the
    output goes to the file named with parser.options.background_output_name
    in the same directory. Both file names are prefixed with
    '<sample_id>_<end>_'. Up to 100 targets per query are requested, in
    tabular format 6 with twelve explicitly listed columns.

    The e-value threshold passed to DIAMOND is
    background_db_size * evalue_cutoff / reference_db_size, all three taken
    from the configuration of the sample's collection.

    Args:
        parser (:obj:DiamondParser): parser object processing an input sequence file
        command (str): either 'blastx' or 'blastp' (see DIAMOND manual)
    """
    print('Starting DIAMOND')
    sample_id = parser.sample.sample_id
    collection = parser.options.get_collection(sample_id)
    project_dir = parser.options.get_project_dir(sample_id)
    file_prefix = sample_id + '_' + parser.end + '_'

    database = parser.config.get_background_diamond_db(collection)
    query_path = os.path.join(
        project_dir, file_prefix + parser.options.ref_hits_fastq_name)
    output_path = os.path.join(
        project_dir, file_prefix + parser.options.background_output_name)
    # Keep the (size * cutoff) / size evaluation order of the threshold
    evalue = (parser.config.get_background_db_size(collection)
              * parser.config.get_evalue_cutoff(collection)
              / parser.config.get_reference_db_size(collection))
    output_columns = [
        'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'slen',
        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
    ]

    # The --threads option is not passed (it was commented out in the
    # original implementation).
    diamond_args = [
        parser.config.diamond_path, command,
        '--db', database,
        '--query', query_path,
        '--out', output_path,
        '--max-target-seqs', '100',
        '--evalue', str(evalue),
        '--outfmt', '6'
    ] + output_columns
    run_external_program(diamond_args)
    print('DIAMOND finished')
def run_ref_search(parser, command, options=None):
    """Runs the pre-selection DIAMOND search against the reference database.

    The query is the sample's FASTQ path returned by
    parser.options.get_fastq_path, and the output goes to
    '<sample_id>_<end>_' + parser.options.ref_output_name in the project
    directory. Up to 50 targets per query are requested, in tabular format 6
    with twelve explicitly listed columns, using the e-value cutoff configured
    for the sample's collection.

    Args:
        parser (:obj:DiamondParser): parser object processing an input sequence file
        command (str): either 'blastx' or 'blastp' (see DIAMOND manual)
        options (list of str): extra command-line arguments placed right
            after the command; must be a list because it is concatenated to
            the argument list. None (default) adds nothing.
    """
    print('Starting DIAMOND')
    diamond_args = [parser.config.diamond_path, command]
    if options is not None:
        diamond_args = diamond_args + options

    sample_id = parser.sample.sample_id
    collection = parser.options.get_collection(sample_id)
    database = parser.config.get_reference_diamond_db(collection)
    query_path = parser.options.get_fastq_path(sample_id, parser.end)
    output_path = os.path.join(
        parser.options.get_project_dir(sample_id),
        sample_id + '_' + parser.end + '_' + parser.options.ref_output_name)
    evalue = str(parser.config.get_evalue_cutoff(collection))

    search_args = [
        '--db', database,
        '--query', query_path,
        '--out', output_path,
        '--max-target-seqs', '50',
        '--evalue', evalue,
        '--outfmt', '6',
        'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'slen',
        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
    ]
    run_external_program(diamond_args + search_args)
    print('DIAMOND finished')
Ejemplo n.º 4
0
def make_assembly_taxonomy_chart(tax_profile,
                                 genes,
                                 function_list,
                                 outfile,
                                 krona_path,
                                 metric='efpkg'):
    """Creates a Krona taxonomy chart for an assembly.

    The Krona XML document (attribute declarations, one dataset per entry of
    function_list and the node tree returned by get_assembly_tax_xml for
    ROOT_TAXONOMY_ID) is written to outfile. Krona Tools is then run on that
    file, with the HTML report path set to outfile + '.html'.

    Dataset names are inserted into the XML as-is, without escaping.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object
        genes (defaultdict[str,defaultdict[str,dict[str,float]]]): gene data,
            genes[gene_id][function_id][parameter_name] = parameter_value,
            where parameter_name is in
            [metric, 'count', 'identity', 'coverage', 'Length', 'Completeness']
        function_list (list of str): function identifiers, one dataset is
            declared for each of them
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default)
    """
    with open(outfile, 'w') as out:
        # Header
        header = [
            '<krona key="false">\n',
            ''.join(('\t<attributes magnitude="', metric, '">\n')),
        ]
        # A separate read count attribute is declared only if the metric
        # itself is not the read count
        if metric != 'readcount':
            header.append(
                '\t\t<attribute display="Read count">readcount</attribute>\n')
        header.append(''.join(
            ('\t\t<attribute display="Score:', metric, '">', metric,
             '</attribute>\n')))
        header.append(
            '\t\t<attribute display="Coverage" mono="true">coverage</attribute>\n')
        header.append(
            '\t\t<attribute display="Length" mono="true">Length</attribute>\n')
        header.append(
            '\t\t<attribute display="CDS completeness %" mono="true">'
            'Completeness</attribute>\n')
        header.append(
            '\t\t<attribute display="Best hit identity %" mono="true">'
            'identity</attribute>\n')
        # Marked as obsolete in the original code, but still declared
        header.append(
            '\t\t<attribute display="UniRef hit" '
            'hrefbase="https://www.uniprot.org/uniref/" '
            'target="uniref" mono="true">best_hit</attribute>\n')
        header.append('\t</attributes>\n')
        header.append(
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n')
        out.writelines(header)

        # Datasets: one entry per function, in the order given
        out.write('\t<datasets>\n')
        out.writelines(
            ''.join(('\t\t<dataset>', function, '</dataset>\n'))
            for function in function_list)
        out.write('\t</datasets>\n')

        # Nodes
        nodes_xml = get_assembly_tax_xml(tax_profile, genes, function_list,
                                         ROOT_TAXONOMY_ID, 1, metric)
        out.write(nodes_xml)
        out.write('</krona>')

    # Generate the HTML chart once the XML file has been closed
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
Ejemplo n.º 5
0
def make_functions_chart(parser, metric='efpkg'):
    """Writes XML file for functions chart and generates Krona plot from it

    Only reads with status STATUS_GOOD are used. For each such read, the
    scores in read.functions are summed per function, per function group and
    in total, and the read identifier is recorded for the function and its
    group. Identity values of all hits of the read are collected per function
    and per group; the chart shows their arithmetic mean (0.0 if there are
    none).

    The XML file has a three-level hierarchy: sample -> function group ->
    function. It is written to the output subdirectory of the sample's
    project directory as '<sample_id>_<end>_' + parser.options.xml_name, and
    the Krona HTML report is created next to it with
    parser.options.html_name.

    NOTE(review): sample, group and function names are inserted into the XML
    without escaping; names containing '&', '<' or '"' would produce
    malformed XML. Confirm whether names are sanitized upstream.

    Args:
        parser (:obj:DiamondParser): parser object with annotated reads
        metric (str): scoring metric (efpkg by default). 'proteincount' is
            handled like 'readcount', except that the count attribute is
            displayed as "Protein count".
    """
    sample_id = parser.sample.sample_id
    file_prefix = sample_id + '_' + parser.end + '_'
    project_dir = parser.options.get_project_dir(sample_id)
    output_subdir = parser.options.get_output_subdir(sample_id)
    outfile = os.path.join(project_dir, output_subdir,
                           file_prefix + parser.options.xml_name)
    html_file = os.path.join(project_dir, output_subdir,
                             file_prefix + parser.options.html_name)

    if metric == 'proteincount':
        # Protein counts are written under the 'readcount' attribute
        metric = 'readcount'
        count_label = 'Protein count'
    else:
        count_label = 'Read count'
    # A separate read count is written only when the metric is a real score
    has_score = metric != 'readcount'

    def value_tags(indent, n_reads, score):
        """Returns the read count tag (if needed) and the metric tag of a node."""
        tags = ''
        if has_score:
            tags += (indent + '<readcount><val>' + str(n_reads) +
                     '</val></readcount>\n')
        tags += (indent + '<' + metric + '><val>' + str(score) + '</val></' +
                 metric + '>\n')
        return tags

    def identity_tag(indent, identities):
        """Returns the identity tag of a node with the mean identity value."""
        if identities:
            mean_identity = str(sum(identities) / len(identities))
        else:
            mean_identity = '0.0'
        return indent + '<identity><val>' + mean_identity + '</val></identity>\n'

    ref_data = parser.ref_data
    with open(outfile, 'w') as out:
        # Write header
        out.write('<krona key="false">\n' + '\t<attributes magnitude="' +
                  metric + '">\n' + '\t\t<attribute display="' + count_label +
                  '">readcount</attribute>\n')
        if has_score:
            out.write('\t\t<attribute display="Score:' + metric + '">' +
                      metric + '</attribute>\n')
        out.write(
            '\t\t<attribute display="AAI %" mono="true">identity</attribute>\n'
            '\t</attributes>\n'
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n'
            # Write dataset
            '\t<datasets>\n\t\t<dataset>' + sample_id +
            '</dataset>\n\t</datasets>\n')

        # Collect scores, read identifiers and identity values
        read_count = 0
        total_rpkm = 0.0
        groups_rpkm = defaultdict(float)
        groups_counts = defaultdict(set)
        groups_identity = defaultdict(list)
        functions_counts = defaultdict(set)
        functions_rpkm = defaultdict(float)
        functions_identity = defaultdict(list)
        for read in parser.reads.values():
            if read.status != STATUS_GOOD:
                continue
            read_count += 1
            for function in read.functions:
                score = read.functions[function]
                group = ref_data.lookup_function_group(function)
                total_rpkm += score
                groups_counts[group].add(read.read_id)
                functions_rpkm[function] += score
                groups_rpkm[group] += score
                functions_counts[function].add(read.read_id)
            for hit in read.hit_list.hits:
                for function in hit.functions:
                    functions_identity[function].append(hit.identity)
                    groups_identity[ref_data.lookup_function_group(
                        function)].append(hit.identity)

        # Write nodes
        # Write top-level node (it has no identity value)
        out.write('\t<node name="' + sample_id + '_' + parser.end + '">\n')
        out.write(value_tags('\t\t', read_count, total_rpkm))

        for group, group_rpkm in groups_rpkm.items():
            # Write group-level node
            out.write('\t\t<node name="' + group + '">\n')
            out.write(
                value_tags('\t\t\t', len(groups_counts[group]), group_rpkm))
            out.write(identity_tag('\t\t\t', groups_identity.get(group)))
            for function in ref_data.get_functions_in_group(group):
                if function not in functions_rpkm:
                    # Function of the group without any good read
                    continue
                # Write function-level node
                out.write('\t\t\t<node name="' + function + '">\n')
                out.write(
                    value_tags('\t\t\t\t', len(functions_counts[function]),
                               functions_rpkm[function]))
                out.write(
                    identity_tag('\t\t\t\t', functions_identity.get(function)))
                out.write('\t\t\t</node>\n')
            # Close group-level node
            out.write('\t\t</node>\n')
        # Close top-level node
        out.write('\t</node>\n</krona>')

    # Run Krona
    run_external_program([parser.config.krona_path, '-o', html_file, outfile])