def run_mapper(functions, output_dir, mapper_command, is_paired_end=True):
    """Runs Bowtie2 mapper on filtered contigs

    For each function, reads from <output_dir>/<function>_pe1.fastq (and
    <function>_pe2.fastq for paired-end data) are aligned against the index
    <output_dir>/<function>/index/index, and the alignment is written to
    <output_dir>/<function>/contigs.sam. Functions with missing or empty
    final.contigs.filtered.fa file are skipped without any message.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with FASTQ files and function
            subdirectories
        mapper_command (str): mapper command (currently ignored, see note
            in the code)
        is_paired_end (bool): True for paired-end reads, False for
            single-end reads
    """
    # NOTE(review): the mapper_command argument has always been overridden
    # with 'bowtie2'. This is kept as is, because callers outside of this
    # function may rely on it - confirm before honoring the argument.
    mapper_command = 'bowtie2'
    for function in functions:
        function_dir = os.path.join(output_dir, function)
        contigs_file = os.path.join(function_dir, 'final.contigs.filtered.fa')
        if not os.path.exists(contigs_file):
            continue
        if os.path.getsize(contigs_file) == 0:
            # Nothing was assembled for this function, so there is no index
            continue
        print('Run read mapping for function', function)
        if is_paired_end:
            reads_args = ['-1', os.path.join(output_dir, function + '_pe1.fastq'),
                          '-2', os.path.join(output_dir, function + '_pe2.fastq')]
        else:
            reads_args = ['-U', os.path.join(output_dir, function + '_pe1.fastq')]
        # The SAM file is passed with the -S option. A '>file' argument is
        # a redirection only if the command is interpreted by a shell;
        # otherwise the program receives it as a literal argument.
        mapper_args = [mapper_command,
                       '-q',
                       '--very-sensitive',
                       '--quiet',
                       '-x', os.path.join(function_dir, 'index', 'index')
                       ] + reads_args + [
                           '-S', os.path.join(function_dir, 'contigs.sam')
                       ]
        run_external_program(mapper_args)
def run_bgr_search(project):
    """Runs DIAMOND classification search on predicted genes

    Runs 'blastp' search against background DIAMOND database of the
    project's collection. Query and output files are located in the assembly
    directory and have 'all_contigs_' prefix. Up to 50 target sequences are
    reported for each query, in tabular format.

    Args:
        project: project object; its config and options attributes provide
            DIAMOND path, database parameters, thread count and file names
    """
    # NOTE(review): another function with the same name is defined later in
    # this file. If both are parts of one module, the later one wins.
    config = project.config
    options = project.options
    collection = options.get_collection()
    assembly_dir = options.assembly_dir

    background_db = config.get_background_diamond_db(collection)
    query_file = os.path.join(assembly_dir,
                              'all_contigs_' + options.ref_hits_fastq_name)
    output_file = os.path.join(assembly_dir,
                               'all_contigs_' + options.background_output_name)
    # E-value cutoff is scaled by the ratio of background DB size to
    # reference DB size
    evalue = (config.get_background_db_size(collection)
              * config.get_evalue_cutoff(collection)
              / config.get_reference_db_size(collection))
    output_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                      'slen', 'qstart', 'qend', 'sstart', 'send', 'evalue',
                      'bitscore']

    print('Starting DIAMOND')
    diamond_args = [config.diamond_path, 'blastp']
    diamond_args += ['--db', background_db]
    diamond_args += ['--query', query_file]
    diamond_args += ['--out', output_file]
    diamond_args += ['--max-target-seqs', '50']
    diamond_args += ['--evalue', str(evalue)]
    diamond_args += ['--threads', config.threads]
    diamond_args += ['--outfmt', '6'] + output_columns
    run_external_program(diamond_args)
    print('DIAMOND finished')
def make_taxonomy_series_chart(tax_profile, sample_list, outfile, krona_path,
                               metric='efpkg'):
    """Writes Krona XML file with taxonomy data for a series of samples and
    runs Krona on it.

    Taxonomy profile must have two-level attributes, with function
    identifier as outer key and a metric as inner key. HTML output is
    written to the path of XML file with '.html' appended.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object
        sample_list (list of str): sample identifiers
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default)
    """
    with open(outfile, 'w') as out:
        # Header: list of attributes depends on the metric
        header = ['<krona key="false">\n',
                  '\t<attributes magnitude="' + metric + '">\n']
        if metric == 'proteincount':
            header.append('\t\t<attribute display="Protein count">'
                          + metric + '</attribute>\n')
        else:
            header.append(
                '\t\t<attribute display="Read count">readcount</attribute>\n')
        if metric not in ('readcount', 'proteincount'):
            # Any other metric is displayed as a separate score attribute
            header.append('\t\t<attribute display="Score:' + metric + '">'
                          + metric + '</attribute>\n')
        header.append(
            '\t\t<attribute display="AAI %" mono="true">identity</attribute>\n')
        header.append('\t</attributes>\n')
        header.append(
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n')
        out.writelines(header)

        # Datasets: one per sample
        datasets = ['\t\t<dataset>' + sample + '</dataset>\n'
                    for sample in sample_list]
        out.writelines(['\t<datasets>\n'] + datasets + ['\t</datasets>\n'])

        # Nodes: generated for the entire tree, starting from root
        nodes_xml = get_lca_dataseries_tax_xml(
            tax_profile, sample_list, ROOT_TAXONOMY_ID, 1, metric=metric)[0]
        out.write(nodes_xml)

        # Close XML
        out.write('</krona>')

    # Run Krona
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
def run_prodigal(infile, outfile, prodigal_path):
    """Runs Prodigal gene prediction on filtered contigs

    Prodigal is started with 'meta' procedure. Text output of Prodigal goes
    to a file with the same path as outfile plus 'prodigal.txt' suffix.

    Args:
        infile (str): path to input file (passed with -i option)
        outfile (str): path to output file (passed with -a option)
        prodigal_path (str): Prodigal command
    """
    print('Starting Prodigal')
    # The suffix is appended directly to the path, without any separator
    text_output = outfile + 'prodigal.txt'
    options = (('-p', 'meta'),
               ('-a', outfile),
               ('-i', infile),
               ('-o', text_output))
    prodigal_args = [prodigal_path]
    for option, value in options:
        prodigal_args.extend((option, value))
    run_external_program(prodigal_args)
    print('Prodigal finished')
def run_trimmomatic(file1, file2, sample_id, working_directory, threads):
    """Runs trimmomatic for one or two files

    If the second file is not given, TrimmomaticSE is started for file1.
    Otherwise, TrimmomaticPE is started for both files with -baseout
    option, and paths of files with paired output reads are returned.

    Args:
        file1 (str): path to input file 1 (FASTQ)
        file2 (str): path to input file 2 (FASTQ paired-end). None or
            empty string for single-end data
        sample_id (str): sample identifier
        working_directory: directory where Trimmomatic will write output files
        threads (str): number of threads

    Returns:
        outfile1 (str): path to FASTQ file with trimmed paired-end1 reads
            (trimmed single-end reads, if file2 is not given)
        outfile2 (str): path to FASTQ file with trimmed paired-end2 reads
            (empty string, if file2 is not given)
    """
    print('Starting Trimmomatic')
    # Trimming steps are the same for both modes
    trimming_steps = [
        'ILLUMINACLIP:/usr/share/trimmomatic/TruSeq3-PE.fa:2:30:10',
        'LEADING:3',
        'TRAILING:3',
        'SLIDINGWINDOW:4:14',
        'MINLEN:50'
    ]
    # None is treated like an empty string here. Only '' used to be
    # recognized, so None was passed on to TrimmomaticPE as an argument.
    if not file2:
        outfile1 = os.path.join(working_directory, sample_id + '_SE.fastq.gz')
        outfile2 = ''
        trimmomatic_args = ['TrimmomaticSE',
                            '-threads', threads,
                            '-phred33',
                            file1,
                            outfile1]
    else:
        # Names of output files are derived by Trimmomatic from -baseout
        outfile1 = os.path.join(working_directory, sample_id + '_1P.fastq.gz')
        outfile2 = os.path.join(working_directory, sample_id + '_2P.fastq.gz')
        trimmomatic_args = ['TrimmomaticPE',
                            '-threads', threads,
                            '-phred33',
                            file1,
                            file2,
                            '-baseout',
                            os.path.join(working_directory,
                                         sample_id + '.fastq.gz')]
    run_external_program(trimmomatic_args + trimming_steps)
    print('Trimmomatic finished')
    return outfile1, outfile2
def run_mapper_indexing(functions, output_dir, mapper_command):
    """Runs Bowtie2 indexer on filtered contigs

    For each function having non-empty final.contigs.filtered.fa file in
    its subdirectory, creates 'index' subdirectory (if it does not exist)
    and builds an index with 'index' base name in it.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with function subdirectories
        mapper_command (str): indexer command (ignored: the value is
            replaced with 'bowtie2-build')
    """
    # The argument value is always replaced with the hard-coded command
    mapper_command = 'bowtie2-build'
    for function in functions:
        function_dir = os.path.join(output_dir, function)
        contigs_file = os.path.join(function_dir, 'final.contigs.filtered.fa')
        if not os.path.exists(contigs_file):
            print('Contigs file for function', function, 'not found')
            continue
        # This message is printed even if the contigs file is empty
        print('Run indexing for function', function)
        if os.path.getsize(contigs_file) <= 0:
            continue
        index_dir = os.path.join(function_dir, 'index')
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        run_external_program([mapper_command,
                              '-f',
                              contigs_file,
                              os.path.join(index_dir, 'index')])
def run_bgr_search(parser, command, options=None):
    """Runs classification DIAMOND search

    The search runs against background DIAMOND database of the sample's
    collection. Query and output files are located in the project directory
    of the sample, and their names start with sample identifier and end
    identifier. Up to 100 target sequences are reported for each query, in
    tabular format.

    Args:
        parser (:obj:DiamondParser): parser object processing an input
            sequence file
        command (str): either 'blastx' or 'blastp' (see DIAMOND manual)
        options (list of str): additional DIAMOND arguments, inserted right
            after the command (None by default)
    """
    config = parser.config
    sample_id = parser.sample.sample_id
    collection = parser.options.get_collection(sample_id)
    project_dir = parser.options.get_project_dir(sample_id)
    file_prefix = sample_id + '_' + parser.end + '_'

    # E-value cutoff is scaled by the ratio of background DB size to
    # reference DB size
    evalue = (config.get_background_db_size(collection)
              * config.get_evalue_cutoff(collection)
              / config.get_reference_db_size(collection))
    output_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                      'slen', 'qstart', 'qend', 'sstart', 'send', 'evalue',
                      'bitscore']

    print('Starting DIAMOND')
    diamond_args = [config.diamond_path, command]
    if options is not None:
        diamond_args = diamond_args + options
    diamond_args = diamond_args + [
        '--db', config.get_background_diamond_db(collection),
        '--query', os.path.join(
            project_dir, file_prefix + parser.options.ref_hits_fastq_name),
        '--out', os.path.join(
            project_dir, file_prefix + parser.options.background_output_name),
        '--max-target-seqs', '100',
        '--evalue', str(evalue),
        '--threads', config.threads,
        '--outfmt', '6'
    ] + output_columns
    run_external_program(diamond_args)
    print('DIAMOND finished')
def run_megahit(functions, output_dir, assembler_command, is_paired_end=True):
    """Runs MEGAHIT assembler on exported reads

    The assembler is started separately for each function. Input reads are
    taken from <function>_pe1.fastq (and <function>_pe2.fastq for paired-end
    data) in the output directory, and the subdirectory named after the
    function is passed as assembler output directory.

    Args:
        functions (list of str): function identifiers
        output_dir (str): directory with exported FASTQ files
        assembler_command (str): assembler command
        is_paired_end (bool): True for paired-end reads, False for
            single-end reads
    """
    print('Starting assembly')
    for function in functions:
        print('Run assembler for function', function)
        reads1 = os.path.join(output_dir, function + '_pe1.fastq')
        if is_paired_end:
            reads2 = os.path.join(output_dir, function + '_pe2.fastq')
            input_args = ['-1', reads1, '-2', reads2]
        else:
            input_args = ['-r', reads1]
        assembler_args = [assembler_command] + input_args
        assembler_args += ['-o', os.path.join(output_dir, function)]
        run_external_program(assembler_args)
        print('Assembler finished for function ', function)
    print('Assembly finished')
def make_assembly_taxonomy_chart(tax_profile, genes, function_list, outfile,
                                 krona_path, metric='efpkg'):
    """Writes Krona XML file with taxonomy data of assembly (one dataset for
    each function in the list) and runs Krona on it.

    HTML output is written to the path of XML file with '.html' appended.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile object
        genes (defaultdict[str,defaultdict[str,dict[str,float]]]): outer key
            is gene identifier, middle key is function identifier, inner key
            is in [metric, 'count', 'identity', 'coverage', 'Length',
            'Completeness'], value is float
            (genes[gene_id][function_id][parameter_name] = parameter_value).
        function_list (list of str): function identifiers
        outfile (str): path for XML output
        krona_path (str): Krona Tools command
        metric (str): scoring metric (efpkg by default)
    """
    with open(outfile, 'w') as out:
        # Header
        out.write('<krona key="false">\n')
        out.write('\t<attributes magnitude="' + metric + '">\n')
        attributes = []
        if metric != 'readcount':
            attributes.append(
                '\t\t<attribute display="Read count">readcount</attribute>\n')
        attributes += [
            '\t\t<attribute display="Score:' + metric + '">' + metric
            + '</attribute>\n',
            '\t\t<attribute display="Coverage" mono="true">coverage'
            '</attribute>\n',
            '\t\t<attribute display="Length" mono="true">Length</attribute>\n',
            '\t\t<attribute display="CDS completeness %" mono="true">'
            'Completeness</attribute>\n',
            '\t\t<attribute display="Best hit identity %" mono="true">identity'
            '</attribute>\n',
            # Obsolete
            '\t\t<attribute display="UniRef hit" '
            'hrefbase="https://www.uniprot.org/uniref/" '
            'target="uniref" mono="true">best_hit</attribute>\n',
        ]
        out.writelines(attributes)
        out.write('\t</attributes>\n')
        out.write(
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n')

        # Datasets: one per function
        out.write('\t<datasets>\n')
        out.writelines('\t\t<dataset>' + function + '</dataset>\n'
                       for function in function_list)
        out.write('\t</datasets>\n')

        # Nodes: generated for the entire tree, starting from root
        nodes_xml = get_assembly_tax_xml(
            tax_profile, genes, function_list, ROOT_TAXONOMY_ID, 1, metric)
        out.write(nodes_xml)

        # Close XML
        out.write('</krona>')

    # Run Krona
    run_external_program([krona_path, '-o', outfile + '.html', outfile])
def make_functions_chart(parser, metric='efpkg'):
    """Writes Krona XML file with functional profile of a sample and runs
    Krona on it.

    The chart has three levels: sample, function groups and functions. Only
    reads with STATUS_GOOD status are counted. Both XML and HTML files are
    written to the output subdirectory of the sample's project directory.

    Args:
        parser (:obj:DiamondParser): parser object with annotated reads
        metric (str): scoring metric (efpkg by default). 'proteincount' is
            replaced with 'readcount' and displayed as 'Protein count'
    """
    sample_id = parser.sample.sample_id
    output_dir = os.path.join(parser.options.get_project_dir(sample_id),
                              parser.options.get_output_subdir(sample_id))
    chart_name = sample_id + '_' + parser.end
    outfile = os.path.join(output_dir,
                           chart_name + '_' + parser.options.xml_name)

    with open(outfile, 'w') as out:
        # Write header
        if metric == 'proteincount':
            metric = 'readcount'
            count_attribute = ('\t\t<attribute display="Protein count">'
                               + metric + '</attribute>\n')
        else:
            count_attribute = (
                '\t\t<attribute display="Read count">readcount</attribute>\n')
        out.write('<krona key="false">\n'
                  + '\t<attributes magnitude="' + metric + '">\n'
                  + count_attribute)
        # Read count is a separate value only if it is not the metric itself
        separate_readcount = metric != 'readcount'
        if separate_readcount:
            out.write('\t\t<attribute display="Score:' + metric + '">'
                      + metric + '</attribute>\n')
        out.write(
            '\t\t<attribute display="AAI %" mono="true">identity</attribute>\n'
            '\t</attributes>\n'
            '\t<color attribute="identity" valueStart="50" valueEnd="100" '
            'hueStart="0" hueEnd="240" default="true"></color>\n'
            # Write dataset
            '\t<datasets>\n\t\t<dataset>' + sample_id
            + '</dataset>\n\t</datasets>\n')

        # Collect scores, read identifiers and hit identities for functions
        # and function groups
        lookup_group = parser.ref_data.lookup_function_group
        read_count = 0
        total_rpkm = 0.0
        groups_rpkm = defaultdict(float)
        groups_counts = defaultdict(set)
        groups_identity = defaultdict(list)
        functions_counts = defaultdict(set)
        functions_rpkm = defaultdict(float)
        functions_identity = defaultdict(list)
        for read in parser.reads.values():
            if read.status != STATUS_GOOD:
                continue
            read_count += 1
            for function in read.functions:
                score = read.functions[function]
                group = lookup_group(function)
                total_rpkm += score
                groups_counts[group].add(read.read_id)
                functions_rpkm[function] += score
                groups_rpkm[group] += score
                functions_counts[function].add(read.read_id)
            for hit in read.hit_list.hits:
                for function in hit.functions:
                    functions_identity[function].append(hit.identity)
                    groups_identity[lookup_group(function)].append(
                        hit.identity)

        def node_values(indent, read_ids, score, identities):
            """Returns XML text with values of group or function node"""
            values = []
            if separate_readcount:
                values.append(indent + '<readcount><val>'
                              + str(len(read_ids)) + '</val></readcount>\n')
            values.append(indent + '<' + metric + '><val>' + str(score)
                          + '</val></' + metric + '>\n')
            if identities is None:
                # No hits were collected for this node
                values.append(indent + '<identity><val>0.0</val></identity>\n')
            else:
                values.append(indent + '<identity><val>'
                              + str(sum(identities) / len(identities))
                              + '</val></identity>\n')
            return ''.join(values)

        # Write nodes
        # Top-level node (has no identity value)
        out.write('\t<node name="' + chart_name + '">\n')
        if separate_readcount:
            out.write('\t\t<readcount><val>' + str(read_count)
                      + '</val></readcount>\n')
        out.write('\t\t<' + metric + '><val>' + str(total_rpkm)
                  + '</val></' + metric + '>\n')
        for group in groups_rpkm:
            # Group-level node
            out.write('\t\t<node name="' + group + '">\n')
            out.write(node_values('\t\t\t',
                                  groups_counts[group],
                                  groups_rpkm[group],
                                  groups_identity.get(group)))
            for function in parser.ref_data.get_functions_in_group(group):
                if function not in functions_rpkm:
                    continue
                # Function-level node
                out.write('\t\t\t<node name="' + function + '">\n')
                out.write(node_values('\t\t\t\t',
                                      functions_counts[function],
                                      functions_rpkm[function],
                                      functions_identity.get(function)))
                out.write('\t\t\t</node>\n')
            # Close group-level node
            out.write('\t\t</node>\n')
        # Close top-level node
        out.write('\t</node>\n</krona>')

    # Run Krona
    html_file = os.path.join(output_dir,
                             chart_name + '_' + parser.options.html_name)
    run_external_program([parser.config.krona_path, '-o', html_file, outfile])