Beispiel #1
0
def run_mapper(functions, output_dir, mapper_command, is_paired_end=True):
    """Runs Bowtie2 mapper on filtered contigs

    For each function that has a non-empty 'final.contigs.filtered.fa' file in
    '<output_dir>/<function>/', reads exported for that function are mapped
    against the index '<output_dir>/<function>/index/index', and the SAM output
    is written to '<output_dir>/<function>/contigs.sam'. Functions with missing
    or empty contigs file are skipped without any message.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with exported reads ('<function>_pe1.fastq',
            '<function>_pe2.fastq') and one subdirectory per function
        mapper_command (str): currently ignored, 'bowtie2' is always executed
            (the parameter is kept so that existing callers keep working)
        is_paired_end (bool): if True, both FASTQ files are mapped as mates
            ('-1'/'-2'), otherwise only '<function>_pe1.fastq' is mapped as
            unpaired reads ('-U')
    """
    # NOTE(review): the caller-supplied command has always been overridden
    # here; behavior is preserved, confirm whether it should be honored.
    mapper_command = 'bowtie2'

    for function in functions:
        contigs_file = os.path.join(output_dir, function, 'final.contigs.filtered.fa')
        if not os.path.exists(contigs_file):
            continue
        if not os.path.getsize(contigs_file) > 0:
            continue
        print('Run read mapping for function', function)
        reads_pe1 = os.path.join(output_dir, function + '_pe1.fastq')
        if is_paired_end:
            reads_args = ['-1', reads_pe1,
                          '-2', os.path.join(output_dir, function + '_pe2.fastq')]
        else:
            reads_args = ['-U', reads_pe1]
        # The output file is passed with the '-S' option: a '>file' element in
        # an argument list is not a redirection unless a shell interprets it,
        # and would reach Bowtie2 as a stray positional argument.
        mapper_args = [mapper_command,
                       '-q',
                       '--very-sensitive',
                       '--quiet',
                       '-x',
                       os.path.join(output_dir, function, 'index', 'index')]
        mapper_args = mapper_args + reads_args
        mapper_args = mapper_args + ['-S',
                                     os.path.join(output_dir, function, 'contigs.sam')]
        run_external_program(mapper_args)
Beispiel #2
0
def run_bgr_search(project):
    """Runs DIAMOND classification search on predicted genes

    Builds a 'blastp' command line for the DIAMOND executable found in
    project.config.diamond_path and passes it to run_external_program. Query
    and output files are located in project.options.assembly_dir and have
    names starting with 'all_contigs_'. The search runs against the background
    database of the project's collection and reports up to 50 targets per
    query in tabular format (outfmt 6) with twelve explicitly listed columns.

    Args:
        project: project object; its 'config' and 'options' attributes supply
            paths, database sizes, e-value cutoff and thread count
    """
    print('Starting DIAMOND')
    config = project.config
    options = project.options
    collection = options.get_collection()

    query_file = os.path.join(options.assembly_dir,
                              'all_contigs_' + options.ref_hits_fastq_name)
    output_file = os.path.join(options.assembly_dir,
                               'all_contigs_' + options.background_output_name)
    # E-value cutoff defined for the reference DB is rescaled by the ratio of
    # background DB size to reference DB size
    evalue = (config.get_background_db_size(collection)
              * config.get_evalue_cutoff(collection)
              / config.get_reference_db_size(collection))
    output_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'slen',
                      'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

    diamond_args = [config.diamond_path, 'blastp']
    diamond_args += ['--db', config.get_background_diamond_db(collection)]
    diamond_args += ['--query', query_file]
    diamond_args += ['--out', output_file]
    diamond_args += ['--max-target-seqs', '50']
    diamond_args += ['--evalue', str(evalue)]
    diamond_args += ['--threads', config.threads]
    diamond_args += ['--outfmt', '6'] + output_columns
    run_external_program(diamond_args)
    print('DIAMOND finished')
Beispiel #3
0
def make_taxonomy_series_chart(tax_profile,
                               sample_list,
                               outfile,
                               krona_path,
                               metric='efpkg'):
    """Generates Krona plot with taxonomy profiles of several samples.

    Writes Krona XML into outfile: attribute declarations, one dataset per
    sample, and the node tree produced by get_lca_dataseries_tax_xml starting
    from ROOT_TAXONOMY_ID. Then runs Krona on that file; the HTML output path
    is outfile with '.html' appended.
    Taxonomy profile must have two-level attributes, with function identifier
    as outer key and a metric as inner key.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object
        sample_list (list of str): sample identifiers
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default)
    """
    with open(outfile, 'w') as out:
        # Header with attribute declarations
        header = ['<krona key="false">\n',
                  '\t<attributes magnitude="' + metric + '">\n']
        if metric == 'proteincount':
            header.append('\t\t<attribute display="Protein count">' + metric
                          + '</attribute>\n')
        else:
            header.append('\t\t<attribute display="Read count">readcount</attribute>\n')
        # A separate score attribute is declared only for metrics other than
        # plain read or protein counts
        if metric not in ('readcount', 'proteincount'):
            header.append('\t\t<attribute display="Score:' + metric + '">'
                          + metric + '</attribute>\n')
        header.append('\t\t<attribute display="AAI %" mono="true">identity</attribute>\n')
        header.append('\t</attributes>\n')
        header.append('\t<color attribute="identity" valueStart="50" valueEnd="100" '
                      'hueStart="0" hueEnd="240" default="true"></color>\n')
        out.writelines(header)

        # One dataset per sample
        out.write('\t<datasets>\n')
        out.writelines('\t\t<dataset>' + sample + '</dataset>\n'
                       for sample in sample_list)
        out.write('\t</datasets>\n')

        # Node tree, starting from the root taxon with offset 1
        xml_nodes, _ = get_lca_dataseries_tax_xml(tax_profile,
                                                  sample_list,
                                                  ROOT_TAXONOMY_ID,
                                                  1,
                                                  metric=metric)
        out.write(xml_nodes)
        out.write('</krona>')

    # Convert XML into HTML chart
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
Beispiel #4
0
def run_prodigal(infile, outfile, prodigal_path):
    """Runs Prodigal gene prediction on filtered contigs

    Prodigal is started with procedure 'meta' ('-p'), infile as input ('-i')
    and outfile as the '-a' file. The '-o' file path is outfile with
    'prodigal.txt' appended directly, without any separator.

    Args:
        infile (str): path to input file with contigs
        outfile (str): path passed to Prodigal as '-a' output
        prodigal_path (str): Prodigal command
    """
    print('Starting Prodigal')
    report_file = outfile + 'prodigal.txt'
    run_external_program([
        prodigal_path,
        '-p', 'meta',
        '-a', outfile,
        '-i', infile,
        '-o', report_file,
    ])
    print('Prodigal finished')
Beispiel #5
0
def run_trimmomatic(file1, file2, sample_id, working_directory, threads):
    """Runs trimmomatic for one or two files

    If file2 is empty or None, 'TrimmomaticSE' is executed for file1 only.
    Otherwise 'TrimmomaticPE' is executed for both files with '-baseout' set
    to '<working_directory>/<sample_id>.fastq.gz'. The same trimming steps
    are applied in both modes.

    Args:
        file1 (str): path to input file 1 (FASTQ)
        file2 (str): path to input file 2 (FASTQ paired-end); empty string or
            None for single-end data
        sample_id (str): sample identifier
        working_directory: directory where Trimmomatic will write output files
        threads (str): number of threads

    Returns:
        outfile1 (str): path to FASTQ file with trimmed reads:
            '<sample_id>_SE.fastq.gz' in single-end mode,
            '<sample_id>_1P.fastq.gz' in paired-end mode
        outfile2 (str): path to FASTQ file with trimmed paired-end2 reads
            ('<sample_id>_2P.fastq.gz'), or empty string in single-end mode
    """
    print('Starting Trimmomatic')
    # NOTE(review): the paired-end adapter file is used in single-end mode
    # as well; kept as is, confirm that this is intended.
    trimming_steps = ['ILLUMINACLIP:/usr/share/trimmomatic/TruSeq3-PE.fa:2:30:10',
                      'LEADING:3',
                      'TRAILING:3',
                      'SLIDINGWINDOW:4:14',
                      'MINLEN:50']
    # Any false value (empty string or None, as the documentation allows)
    # selects single-end mode; previously None fell into the paired-end
    # branch and ended up in the argument list.
    if not file2:
        outfile1 = os.path.join(working_directory, sample_id + '_SE.fastq.gz')
        outfile2 = ''
        trimmomatic_args = ['TrimmomaticSE',
                            '-threads', threads,
                            '-phred33',
                            file1,
                            outfile1]
    else:
        # Output file names are expected to be derived by Trimmomatic from
        # the -baseout value
        outfile1 = os.path.join(working_directory, sample_id + '_1P.fastq.gz')
        outfile2 = os.path.join(working_directory, sample_id + '_2P.fastq.gz')
        trimmomatic_args = ['TrimmomaticPE',
                            '-threads', threads,
                            '-phred33',
                            file1,
                            file2,
                            '-baseout',
                            os.path.join(working_directory, sample_id + '.fastq.gz')]
    run_external_program(trimmomatic_args + trimming_steps)
    print('Trimmomatic finished')
    return outfile1, outfile2
Beispiel #6
0
def run_mapper_indexing(functions, output_dir, mapper_command):
    """Runs Bowtie2 indexer on filtered contigs

    For each function, looks for '<output_dir>/<function>/final.contigs.filtered.fa'.
    A missing file is reported and skipped. For a non-empty file, subdirectory
    'index' is created if it does not exist, and the indexer is run with
    '<output_dir>/<function>/index/index' as the second positional argument.
    An empty file gets only the 'Run indexing' message.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with one subdirectory per function
        mapper_command (str): not used, 'bowtie2-build' is always executed
    """
    mapper_command = 'bowtie2-build'

    for function in functions:
        contigs_file = os.path.join(output_dir, function, 'final.contigs.filtered.fa')
        if not os.path.exists(contigs_file):
            print('Contigs file for function', function, 'not found')
            continue
        print('Run indexing for function', function)
        if os.path.getsize(contigs_file) <= 0:
            # Nothing to index
            continue
        index_dir = os.path.join(output_dir, function, 'index')
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        run_external_program([
            mapper_command,
            '-f',
            contigs_file,
            os.path.join(output_dir, function, 'index', 'index'),
        ])
Beispiel #7
0
def run_bgr_search(parser, command, options=None):
    """Runs classification DIAMOND search

    Query and output files are located in the project directory of the
    parser's sample, with names starting with '<sample_id>_<end>_'. The search
    runs against the background database of the sample's collection and
    reports up to 100 targets per query in tabular format (outfmt 6) with
    twelve explicitly listed columns.

    Args:
        parser (:obj:DiamondParser): parser object processing an input sequence file
        command (str): either 'blastx' or 'blastp' (see DIAMOND manual)
        options (list of str): additional command-line arguments inserted
            right after the command, or None
    """
    print('Starting DIAMOND')
    diamond_args = [parser.config.diamond_path, command]
    if options is not None:
        diamond_args = diamond_args + options

    sample_id = parser.sample.sample_id
    collection = parser.options.get_collection(sample_id)
    project_dir = parser.options.get_project_dir(sample_id)
    file_prefix = sample_id + '_' + parser.end + '_'
    # E-value cutoff defined for the reference DB is rescaled by the ratio of
    # background DB size to reference DB size
    evalue = (parser.config.get_background_db_size(collection)
              * parser.config.get_evalue_cutoff(collection)
              / parser.config.get_reference_db_size(collection))

    search_args = [
        '--db', parser.config.get_background_diamond_db(collection),
        '--query', os.path.join(project_dir,
                                file_prefix + parser.options.ref_hits_fastq_name),
        '--out', os.path.join(project_dir,
                              file_prefix + parser.options.background_output_name),
        '--max-target-seqs', '100',
        '--evalue', str(evalue),
        '--threads', parser.config.threads,
        '--outfmt', '6',
        'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'slen',
        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
    ]
    run_external_program(diamond_args + search_args)
    print('DIAMOND finished')
Beispiel #8
0
def run_megahit(functions, output_dir, assembler_command, is_paired_end=True):
    """Runs MEGAHIT assembler on exported reads

    The assembler is started once per function, with '<output_dir>/<function>'
    as the '-o' argument.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with exported reads ('<function>_pe1.fastq'
            and, for paired-end data, '<function>_pe2.fastq')
        assembler_command (str): assembler command
        is_paired_end (bool): if True, reads are passed with '-1' and '-2'
            options, otherwise only the first file is passed with '-r'
    """
    print('Starting assembly')
    for function in functions:
        print('Run assembler for function', function)
        reads_pe1 = os.path.join(output_dir, function + '_pe1.fastq')
        if is_paired_end:
            reads_args = ['-1', reads_pe1,
                          '-2', os.path.join(output_dir, function + '_pe2.fastq')]
        else:
            reads_args = ['-r', reads_pe1]
        assembly_dir = os.path.join(output_dir, function)
        run_external_program([assembler_command] + reads_args + ['-o', assembly_dir])
        print('Assembler finished for function ', function)
    print('Assembly finished')
Beispiel #9
0
def make_assembly_taxonomy_chart(tax_profile,
                                 genes,
                                 function_list,
                                 outfile,
                                 krona_path,
                                 metric='efpkg'):
    """Generates Krona plot with taxonomy profile of an assembly.

    Writes Krona XML into outfile: attribute declarations, one dataset per
    function, and the node tree produced by get_assembly_tax_xml starting
    from ROOT_TAXONOMY_ID. Then runs Krona on that file; the HTML output path
    is outfile with '.html' appended.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object
        genes (defaultdict[str,defaultdict[str,dict[str,float]]]): outer key is
            gene identifier, middle key is function identifier, inner key is in
            [metric, 'count', 'identity', 'coverage', 'Length', 'Completeness'],
            value is float (genes[gene_id][function_id][parameter_name] = parameter_value).
        function_list (list of str): function identifiers
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default)
    """
    with open(outfile, 'w') as out:
        # Header with attribute declarations
        header = ['<krona key="false">\n',
                  '\t<attributes magnitude="' + metric + '">\n']
        if metric != 'readcount':
            # Read count is declared separately only when it is not the
            # metric itself
            header.append('\t\t<attribute display="Read count">readcount</attribute>\n')
        header.append('\t\t<attribute display="Score:' + metric + '">' + metric
                      + '</attribute>\n')
        header.append('\t\t<attribute display="Coverage" mono="true">coverage</attribute>\n')
        header.append('\t\t<attribute display="Length" mono="true">Length</attribute>\n')
        header.append('\t\t<attribute display="CDS completeness %" mono="true">Completeness'
                      '</attribute>\n')
        header.append('\t\t<attribute display="Best hit identity %" mono="true">identity'
                      '</attribute>\n')
        # Obsolete
        header.append('\t\t<attribute display="UniRef hit" '
                      'hrefbase="https://www.uniprot.org/uniref/" '
                      'target="uniref" mono="true">best_hit</attribute>\n')
        header.append('\t</attributes>\n')
        header.append('\t<color attribute="identity" valueStart="50" valueEnd="100" '
                      'hueStart="0" hueEnd="240" default="true"></color>\n')
        out.writelines(header)

        # One dataset per function
        out.write('\t<datasets>\n')
        out.writelines('\t\t<dataset>' + function + '</dataset>\n'
                       for function in function_list)
        out.write('\t</datasets>\n')

        # Node tree, starting from the root taxon with offset 1
        xml_nodes = get_assembly_tax_xml(tax_profile, genes, function_list,
                                         ROOT_TAXONOMY_ID, 1, metric)
        out.write(xml_nodes)
        out.write('</krona>')

    # Convert XML into HTML chart
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
Beispiel #10
0
def make_functions_chart(parser, metric='efpkg'):
    """Writes XML file for functions chart and generates Krona plot from it

    The chart has three levels: the sample (named '<sample_id>_<end>'),
    function groups and functions. Only reads with status STATUS_GOOD are
    counted. For each node, the summed score is written under the metric's
    name, together with the number of distinct reads (unless the metric is
    'readcount') and, below the top level, the mean identity of hits.
    Metric 'proteincount' is written under the name 'readcount' with display
    label 'Protein count'.

    XML and HTML files are placed in the output subdirectory of the sample's
    project directory.

    Args:
        parser (:obj:DiamondParser): parser object with annotated reads
        metric (str): scoring metric (efpkg by default)
    """
    sample_id = parser.sample.sample_id
    node_name = sample_id + '_' + parser.end
    outfile = os.path.join(parser.options.get_project_dir(sample_id),
                           parser.options.get_output_subdir(sample_id),
                           node_name + '_' + parser.options.xml_name)

    def mean_identity(identities, key):
        """Returns text of mean identity for the key, '0.0' if there are no hits"""
        if key in identities:
            return str(sum(identities[key]) / len(identities[key]))
        return '0.0'

    with open(outfile, 'w') as out:
        # Write header
        if metric == 'proteincount':
            metric = 'readcount'
            count_label = 'Protein count'
        else:
            count_label = 'Read count'
        out.write('<krona key="false">\n'
                  + '\t<attributes magnitude="' + metric + '">\n'
                  + '\t\t<attribute display="' + count_label
                  + '">readcount</attribute>\n')
        with_read_counts = metric != 'readcount'
        if with_read_counts:
            out.write('\t\t<attribute display="Score:' + metric + '">'
                      + metric + '</attribute>\n')
        out.write('\t\t<attribute display="AAI %" mono="true">identity</attribute>\n'
                  + '\t</attributes>\n'
                  + '\t<color attribute="identity" valueStart="50" valueEnd="100" '
                  + 'hueStart="0" hueEnd="240" default="true"></color>\n'
                  # Write dataset
                  + '\t<datasets>\n\t\t<dataset>' + sample_id
                  + '</dataset>\n\t</datasets>\n')

        # Collect scores, read identifiers and hit identities
        read_count = 0
        total_rpkm = 0.0
        groups_rpkm = defaultdict(float)
        groups_counts = defaultdict(set)
        groups_identity = defaultdict(list)
        functions_counts = defaultdict(set)
        functions_rpkm = defaultdict(float)
        functions_identity = defaultdict(list)
        for read in parser.reads.values():
            if read.status != STATUS_GOOD:
                continue
            read_count += 1
            for function in read.functions:
                group = parser.ref_data.lookup_function_group(function)
                total_rpkm += read.functions[function]
                groups_counts[group].add(read.read_id)
                functions_rpkm[function] += read.functions[function]
                groups_rpkm[group] += read.functions[function]
                functions_counts[function].add(read.read_id)
            for hit in read.hit_list.hits:
                for function in hit.functions:
                    functions_identity[function].append(hit.identity)
                    group = parser.ref_data.lookup_function_group(function)
                    groups_identity[group].append(hit.identity)

        metric_open = '<' + metric + '><val>'
        metric_close = '</val></' + metric + '>\n'

        # Write nodes
        # Top-level node
        out.write('\t<node name="' + node_name + '">\n')
        if with_read_counts:
            out.write('\t\t<readcount><val>' + str(read_count)
                      + '</val></readcount>\n')
        out.write('\t\t' + metric_open + str(total_rpkm) + metric_close)

        for group in groups_rpkm:
            # Group-level node
            out.write('\t\t<node name="' + group + '">\n')
            if with_read_counts:
                out.write('\t\t\t<readcount><val>'
                          + str(len(groups_counts[group]))
                          + '</val></readcount>\n')
            out.write('\t\t\t' + metric_open + str(groups_rpkm[group])
                      + metric_close)
            out.write('\t\t\t<identity><val>'
                      + mean_identity(groups_identity, group)
                      + '</val></identity>\n')
            for function in parser.ref_data.get_functions_in_group(group):
                if function not in functions_rpkm:
                    continue
                # Function-level node
                out.write('\t\t\t<node name="' + function + '">\n')
                if with_read_counts:
                    out.write('\t\t\t\t<readcount><val>'
                              + str(len(functions_counts[function]))
                              + '</val></readcount>\n')
                out.write('\t\t\t\t' + metric_open
                          + str(functions_rpkm[function]) + metric_close)
                out.write('\t\t\t\t<identity><val>'
                          + mean_identity(functions_identity, function)
                          + '</val></identity>\n')
                out.write('\t\t\t</node>\n')
            # Close group-level node
            out.write('\t\t</node>\n')
        # Close top-level node and the document
        out.write('\t</node>\n</krona>')

    # Run Krona
    html_file = os.path.join(parser.options.get_project_dir(sample_id),
                             parser.options.get_output_subdir(sample_id),
                             node_name + '_' + parser.options.html_name)
    run_external_program([parser.config.krona_path, '-o', html_file, outfile])