def main(args):
    """Export the results of a SqueezeMeta project as tab-separated tables.

    The project configuration is parsed from ``args.project_path``, the
    output directory is created, and every table is written to
    ``<output_dir>/<projectname>.<table name>``.

    When ``args.sqm2anvio`` is set only the ORF names are read from the
    merged ORF table, and only ``orf.tax.allfilter.tsv`` and
    ``contig.tax.tsv`` are written. Otherwise the functional tables
    (KEGG/COG/PFAM plus any custom annotation method found in the results
    directory), the sequence tables, the remaining taxonomy tables, the bin
    taxonomy and the per-rank abundance tables are written as well.

    NOTE(review): this file contains a second ``def main(args)`` further
    down; if both definitions are kept, the later one rebinds the name.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``project_path``, ``output_dir``, ``sqm2anvio``,
            ``force_overwrite``, ``trusted_functions`` and
            ``ignore_unclassified``.

    Raises:
        OSError: If the output directory cannot be created for a reason
            other than it already existing.
        SystemExit: With status 1 if the output directory already exists
            and neither ``args.sqm2anvio`` nor ``args.force_overwrite`` is
            set.
    """
    # Function-scope imports so this block does not depend on what the top
    # of the file happens to import.
    import errno
    import sys

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path,
                               override={'$projectdir': args.project_path})
    nokegg, nocog, nopfam, doublepass = map(int, [
        perlVars['$nokegg'],
        perlVars['$nocog'],
        perlVars['$nopfam'],
        perlVars['$doublepass'],
    ])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        if not (args.sqm2anvio or args.force_overwrite):
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive use and may be missing (e.g. ``python -S``).
            sys.exit(1)
        # Otherwise: the caller asked to reuse the existing directory.

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    ### Functions
    if not args.sqm2anvio:
        # Were custom annotation databases used in this project?
        # Result files whose second-to-last dot-separated field is 'fun3'
        # contribute their last field as an annotation method name.
        methods = []
        for fname in listdir(perlVars['$resultpath']):
            fields = fname.split('.')
            if len(fields) > 2 and fields[-2] == 'fun3':
                methods.append(fields[-1])
        customMethods = [
            method for method in methods
            if method not in ('kegg', 'cog', 'pfam', 'wranks')
        ]

        # Parse ORF table.
        sampleNames, orfs, kegg, cog, pfam, custom = parse_orf_table(
            perlVars['$mergedfile'], nokegg, nocog, nopfam,
            args.trusted_functions, args.ignore_unclassified, customMethods)

        # Round aggregated functional abundances.
        # We can have non-integer abundances bc of the way we split counts in ORFs with multiple KEGGs.
        # We round the aggregates to the closest integer for convenience.
        for annot in (kegg, cog, pfam):
            annot['abundances'] = {
                k: a.round().astype(int)
                for k, a in annot['abundances'].items()
            }

        #write_row_dict(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
        if not nokegg:
            write_row_dict(['Name', 'Path'], kegg['info'], prefix + 'KO.names.tsv')
            write_row_dict(sampleNames, kegg['abundances'], prefix + 'KO.abund.tsv')
            write_row_dict(sampleNames, kegg['bases'], prefix + 'KO.bases.tsv')
            write_row_dict(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
            if 'copyNumber' in kegg:
                write_row_dict(sampleNames, kegg['copyNumber'], prefix + 'KO.copyNumber.tsv')
        if not nocog:
            write_row_dict(['Name', 'Path'], cog['info'], prefix + 'COG.names.tsv')
            write_row_dict(sampleNames, cog['abundances'], prefix + 'COG.abund.tsv')
            write_row_dict(sampleNames, cog['bases'], prefix + 'COG.bases.tsv')
            write_row_dict(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
            if 'copyNumber' in cog:
                write_row_dict(sampleNames, cog['copyNumber'], prefix + 'COG.copyNumber.tsv')
                # NOTE(review): assumes cog['coverages'] is only populated
                # together with cog['copyNumber'] -- confirm against
                # parse_orf_table.
                write_row_dict(sampleNames,
                               {'COG0468': cog['coverages']['COG0468']},
                               prefix + 'RecA.tsv')
        if not nopfam:
            write_row_dict(sampleNames, pfam['abundances'], prefix + 'PFAM.abund.tsv')
            write_row_dict(sampleNames, pfam['bases'], prefix + 'PFAM.bases.tsv')
            write_row_dict(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')
            if 'copyNumber' in pfam:
                write_row_dict(sampleNames, pfam['copyNumber'], prefix + 'PFAM.copyNumber.tsv')
        for method, d in custom.items():
            write_row_dict(['Name'], d['info'], prefix + method + '.names.tsv')
            write_row_dict(sampleNames, d['abundances'], prefix + method + '.abund.tsv')
            write_row_dict(sampleNames, d['bases'], prefix + method + '.bases.tsv')
            write_row_dict(sampleNames, d['tpm'], prefix + method + '.tpm.tsv')
            if 'copyNumber' in d:
                write_row_dict(sampleNames, d['copyNumber'], prefix + method + '.copyNumber.tsv')
    else:
        # Not super beautiful code. Just read the orf names and create a fake orf dict
        # since we need to know the names of all the orfs to create the taxonomy output.
        orfs = {'abundances': read_orf_names(perlVars['$mergedfile'])}

    ### Taxonomy.
    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(
        fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    # Note that the same two objects are shared by every ORF added here.
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            # An ORF missing from one taxonomy dict must be missing from all.
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    # Build the "prokfilter" taxonomy: keep the filtered assignment when
    # either taxonomy places the ORF in Bacteria or Archaea at the first
    # rank, and fall back to the unfiltered assignment otherwise.
    prokaryotes = ('Bacteria', 'Archaea')
    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}
    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        # We check both taxonomies.
        if tax[0] in prokaryotes or tax_nofilter[0] in prokaryotes:
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(
        perlVars['$contigtable'])

    write_row_dict(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
    write_row_dict(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    if not args.sqm2anvio:
        fna_blastx = perlVars['$fna_blastx'] if doublepass else None
        write_orf_seqs(orfs['abundances'].keys(), perlVars['$aafile'],
                       fna_blastx, perlVars['$rnafile'],
                       perlVars['$trnafile'] + '.fasta',
                       prefix + 'orf.sequences.tsv')
        write_contig_seqs(perlVars['$contigsfna'], prefix + 'contig.sequences.tsv')

        write_row_dict(TAXRANKS, orf_tax_nofilter, prefix + 'orf.tax.nofilter.tsv')
        write_row_dict(TAXRANKS, orf_tax_prokfilter, prefix + 'orf.tax.prokfilter.tsv')

        ### Bins
        if not int(perlVars['$nobins']):
            # Only the plain bin taxonomy is written; the other two values
            # returned by parse_bin_table are not needed here.
            _, bin_tax, _ = parse_bin_table(perlVars['$bintable'])
            write_row_dict(TAXRANKS, bin_tax, prefix + 'bin.tax.tsv')

        # Per-rank abundance tables. These need sampleNames, which is only
        # available when the full ORF table was parsed above.
        for idx, rank in enumerate(TAXRANKS):
            tax_abunds_orfs = aggregate_tax_abunds(
                orfs['abundances'], orf_tax_prokfilter_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_orfs,
                           prefix + '{}.prokfilter.abund.tsv'.format(rank))
            #write_row_dict(sampleNames, normalize_abunds(tax_abunds_orfs, 100), prefix + '{}.prokfilter.percent.tsv'.format(rank))

            tax_abunds_contigs = aggregate_tax_abunds(
                contig_abunds, contig_tax_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_contigs,
                           prefix + '{}.allfilter.abund.tsv'.format(rank))
def main(args):
    """Export functional and taxonomic tables for a SqueezeMeta project.

    The project configuration is parsed from ``args.project_path``, the
    output directory is created if it does not exist yet, and every table
    is written to ``<output_dir>/<projectname>.<table name>``: KEGG/COG/PFAM
    abundance and TPM tables, optionally the parsed ORF/contig taxonomies,
    and per-rank abundance and percentage tables.

    NOTE(review): this file contains another ``def main(args)`` earlier on;
    being the later definition, this one is the one bound to ``main``.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``project_path``, ``output_dir``, ``trusted_functions``,
            ``ignore_unclassified`` and ``write_parsed_tax``.

    Raises:
        OSError: If the output directory cannot be created for a reason
            other than it already existing.
    """
    # Function-scope import so this block does not depend on what the top
    # of the file happens to import.
    import errno

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path)
    nokegg, nocog, nopfam, doublepass = map(int, [
        perlVars['$nokegg'],
        perlVars['$nocog'],
        perlVars['$nopfam'],
        perlVars['$doublepass'],
    ])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        # An already existing directory is reused silently.
        if e.errno != errno.EEXIST:
            raise

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    sampleNames, orfs, kegg, cog, pfam = parse_orf_table(
        perlVars['$mergedfile'], nokegg, nocog, nopfam,
        args.trusted_functions, args.ignore_unclassified)

    # Round aggregated functional abundances.
    # We can have non-integer abundances bc of the way we split counts in ORFs with multiple KEGGs.
    # We round the aggregates to the closest integer for convenience.
    for annot in (kegg, cog, pfam):
        annot['abundances'] = {
            k: a.round().astype(int)
            for k, a in annot['abundances'].items()
        }

    #write_results(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
    if not nokegg:
        write_results(sampleNames, kegg['abundances'], prefix + 'KO.abund.tsv')
        write_results(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
    if not nocog:
        write_results(sampleNames, cog['abundances'], prefix + 'COG.abund.tsv')
        write_results(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
    if not nopfam:
        write_results(sampleNames, pfam['abundances'], prefix + 'PFAM.abund.tsv')
        write_results(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')

    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(
        fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    # Note that the same two lists are shared by every ORF added here.
    unclass_list = ['Unclassified' for _ in TAXRANKS_SHORT]
    # Fixed: the placeholder was never filled in, so every entry used to be
    # the literal string '{}_Unclassified'.
    unclass_list_wranks = [
        '{}_Unclassified'.format(rank) for rank in TAXRANKS_SHORT
    ]
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            # An ORF missing from one taxonomy dict must be missing from all.
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    # Build the "prokfilter" taxonomy: keep the filtered assignment when
    # either taxonomy places the ORF in Bacteria or Archaea at the first
    # rank, and fall back to the unfiltered assignment otherwise.
    prokaryotes = ('Bacteria', 'Archaea')
    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}
    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        # We check both taxonomies.
        if tax[0] in prokaryotes or tax_nofilter[0] in prokaryotes:
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(
        perlVars['$contigtable'])

    if args.write_parsed_tax:
        write_results(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
        write_results(TAXRANKS, orf_tax_nofilter, prefix + 'orf.tax.nofilter.tsv')
        write_results(TAXRANKS, orf_tax_prokfilter, prefix + 'orf.tax.prokfilter.tsv')
        write_results(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    for idx, rank in enumerate(TAXRANKS):
        # NOTE(review): ORFs are aggregated with the rank-less taxonomy while
        # contigs use the "_wranks" one, and orf_tax_prokfilter_wranks is
        # built above but never used. This looks unintended -- confirm
        # whether orf_tax_prokfilter_wranks should be passed here. Left
        # unchanged so the contents of the output tables do not change.
        tax_abunds_orfs = aggregate_tax_abunds(
            orfs['abundances'], orf_tax_prokfilter, idx)
        write_results(sampleNames, tax_abunds_orfs,
                      prefix + '{}.prokfilter.abund.orfs.tsv'.format(rank))

        tax_abunds_contigs = aggregate_tax_abunds(
            contig_abunds, contig_tax_wranks, idx)
        write_results(sampleNames, tax_abunds_contigs,
                      prefix + '{}.allfilter.abund.tsv'.format(rank))
        write_results(sampleNames, normalize_abunds(tax_abunds_contigs, 100),
                      prefix + '{}.allfilter.percent.tsv'.format(rank))