def main(args):

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path, override={'$projectdir': args.project_path})
    nokegg, nocog, nopfam, doublepass = map(int, [perlVars['$nokegg'], perlVars['$nocog'],
                                                  perlVars['$nopfam'], perlVars['$doublepass']])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        elif args.sqm2anvio or args.force_overwrite: # We know what we are doing.
            pass
        else:
            print('\nThe directory {} already exists. Please remove it or use a different output name.\n'.format(args.output_dir))
            exit(1)

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    ### Functions
    if not args.sqm2anvio:
        # Were custom annotation databases used in this project?
        methods = [f.split('.')[-1] for f in listdir(perlVars['$resultpath'])
                   if len(f.split('.')) > 2 and f.split('.')[-2] == 'fun3']
        customMethods = [method for method in methods if method not in ('kegg', 'cog', 'pfam', 'wranks')]

        # Parse ORF table.
        sampleNames, orfs, kegg, cog, pfam, custom = parse_orf_table(perlVars['$mergedfile'],
                                                                     nokegg, nocog, nopfam,
                                                                     args.trusted_functions,
                                                                     args.ignore_unclassified,
                                                                     customMethods)

        # Round aggregated functional abundances.
        # We can have non-integer abundances bc of the way we split counts in ORFs with multiple KEGGs.
        # We round the aggregates to the closest integer for convenience.
        kegg['abundances'] = {k: a.round().astype(int) for k, a in kegg['abundances'].items()}
        cog['abundances']  = {k: a.round().astype(int) for k, a in cog['abundances'].items()}
        pfam['abundances'] = {k: a.round().astype(int) for k, a in pfam['abundances'].items()}

        #write_row_dict(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
        if not nokegg:
            write_row_dict(['Name', 'Path'], kegg['info'], prefix + 'KO.names.tsv')
            write_row_dict(sampleNames, kegg['abundances'], prefix + 'KO.abund.tsv')
            write_row_dict(sampleNames, kegg['bases'], prefix + 'KO.bases.tsv')
            write_row_dict(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
            if 'copyNumber' in kegg:
                write_row_dict(sampleNames, kegg['copyNumber'], prefix + 'KO.copyNumber.tsv')
        if not nocog:
            write_row_dict(['Name', 'Path'], cog['info'], prefix + 'COG.names.tsv')
            write_row_dict(sampleNames, cog['abundances'], prefix + 'COG.abund.tsv')
            write_row_dict(sampleNames, cog['bases'], prefix + 'COG.bases.tsv')
            write_row_dict(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
            if 'copyNumber' in cog:
                write_row_dict(sampleNames, cog['copyNumber'], prefix + 'COG.copyNumber.tsv')
            write_row_dict(sampleNames, {'COG0468': cog['coverages']['COG0468']}, prefix + 'RecA.tsv')
        if not nopfam:
            write_row_dict(sampleNames, pfam['abundances'], prefix + 'PFAM.abund.tsv')
            write_row_dict(sampleNames, pfam['bases'], prefix + 'PFAM.bases.tsv')
            write_row_dict(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')
            if 'copyNumber' in pfam:
                write_row_dict(sampleNames, pfam['copyNumber'], prefix + 'PFAM.copyNumber.tsv')

        for method, d in custom.items():
            write_row_dict(['Name'], d['info'], prefix + method + '.names.tsv')
            write_row_dict(sampleNames, d['abundances'], prefix + method + '.abund.tsv')
            write_row_dict(sampleNames, d['bases'], prefix + method + '.bases.tsv')
            write_row_dict(sampleNames, d['tpm'], prefix + method + '.tpm.tsv')
            if 'copyNumber' in d:
                write_row_dict(sampleNames, d['copyNumber'], prefix + method + '.copyNumber.tsv')

    else:
        # Not super beautiful code. Just read the orf names and create a fake orf dict
        # since we need to know the names of all the orfs to create the taxonomy output.
        orfs = {'abundances': read_orf_names(perlVars['$mergedfile'])}

    ### Taxonomy.
    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}
    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        if 'Bacteria' in (tax[0], tax_nofilter[0]) or 'Archaea' in (tax[0], tax_nofilter[0]): # We check both taxonomies.
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(perlVars['$contigtable'])

    write_row_dict(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
    write_row_dict(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    if not args.sqm2anvio:
        fna_blastx = perlVars['$fna_blastx'] if doublepass else None
        write_orf_seqs(orfs['abundances'].keys(), perlVars['$aafile'], fna_blastx,
                       perlVars['$rnafile'], perlVars['$trnafile'] + '.fasta',
                       prefix + 'orf.sequences.tsv')
        write_contig_seqs(perlVars['$contigsfna'], prefix + 'contig.sequences.tsv')

        write_row_dict(TAXRANKS, orf_tax_nofilter, prefix + 'orf.tax.nofilter.tsv')
        write_row_dict(TAXRANKS, orf_tax_prokfilter, prefix + 'orf.tax.prokfilter.tsv')

        ### Bins
        if not int(perlVars['$nobins']):
            bin_tpm, bin_tax, bin_tax_wranks = parse_bin_table(perlVars['$bintable'])
            write_row_dict(TAXRANKS, bin_tax, prefix + 'bin.tax.tsv')

        for idx, rank in enumerate(TAXRANKS):
            tax_abunds_orfs = aggregate_tax_abunds(orfs['abundances'], orf_tax_prokfilter_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_orfs, prefix + '{}.prokfilter.abund.tsv'.format(rank))
            #write_row_dict(sampleNames, normalize_abunds(tax_abunds_orfs, 100), prefix + '{}.prokfilter.percent.tsv'.format(rank))

            tax_abunds_contigs = aggregate_tax_abunds(contig_abunds, contig_tax_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_contigs, prefix + '{}.allfilter.abund.tsv'.format(rank))
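

# --- Illustrative usage sketch (not part of the original source) ---
# The main() above only assumes an `args` object exposing: project_path, output_dir,
# trusted_functions, ignore_unclassified, force_overwrite and sqm2anvio.
# Below is a minimal, hypothetical argparse wrapper inferred from those attribute names;
# the flag names and help texts are assumptions, not the script's actual CLI.
def _example_cli():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Aggregate SqueezeMeta results into per-feature tables')
    parser.add_argument('project_path', help='Path to the SqueezeMeta project directory')
    parser.add_argument('output_dir', help='Directory in which the output tables will be written')
    parser.add_argument('--trusted-functions', action='store_true',
                        help='Forwarded to parse_orf_table (semantics assumed)')
    parser.add_argument('--ignore-unclassified', action='store_true',
                        help='Forwarded to parse_orf_table (semantics assumed)')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='Reuse an existing output directory instead of aborting')
    parser.add_argument('--sqm2anvio', action='store_true',
                        help='Write only the subset of files needed downstream by the anvi\'o exporter')
    return parser.parse_args()

#if __name__ == '__main__':
#    main(_example_cli())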
def main(args):

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path)

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        else:
            print('\nThe directory {} already exists. Please remove it or use a different output name.\n'.format(args.output_dir))
            exit(1)

    ### Create samples file.
    with open(perlVars['$mappingfile']) as infile, open('{}/samples.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write('Sample\ttest\n')
        addedSamples = set()
        for line in infile:
            sample = line.split('\t')[0].strip() # There shouldn't be trailing spaces though...
            if sample not in addedSamples:
                addedSamples.add(sample)
                outfile.write('{}\t{}\n'.format(sample, len(addedSamples)))

    ### Create orftable.
    def new2old(str_, orftable=False, bintable=False):
        """Replace 1.0 headers with old headers, so we don't have to modify and re-deploy SQMdb"""
        if orftable:
            str_ = str_.replace('Coverage', 'COVERAGE').replace('Raw read count', 'RAW READ COUNT').replace('Raw base count', 'RAW BASE COUNT')
        else:
            str_ = str_.replace('Strain het', 'Strain Het').replace('Raw read count', 'Raw')
            if bintable:
                str_ = str_.replace('Length', 'Size')
        return str_

    allORFs = []
    newFields  = ['ORF ID', 'Contig ID', 'Length AA', 'GC perc', 'Gene name', 'Tax',
                  'KEGG ID', 'KEGGFUN', 'KEGGPATH', 'COG ID', 'COGFUN', 'COGPATH', 'PFAM']
    goodFields = ['ORF', 'CONTIG ID', 'LENGTH AA', 'GC perc', 'GENNAME', 'TAX ORF',
                  'KEGG ID', 'KEGGFUN', 'KEGGPATH', 'COG ID', 'COGFUN', 'COGPATH', 'PFAM']
    with open(perlVars['$mergedfile']) as infile, open('{}/genes.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write(infile.readline())
        header = infile.readline().strip().split('\t')
        newFields.extend([f for f in header if f.startswith('TPM ') or f.startswith('Coverage ')
                          or f.startswith('Raw read ') or f.startswith('Raw base ')])
        goodFields.extend([new2old(f, True) for f in header if f.startswith('TPM ') or f.startswith('Coverage ')
                           or f.startswith('Raw read ') or f.startswith('Raw base ')])
        outfile.write('\t'.join(goodFields) + '\n')
        idx = {f: i for i, f in enumerate(header) if f in newFields}
        for line in infile:
            line = line.strip().split('\t')
            if line[2] == 'CDS':
                allORFs.append(line[0])
                outfile.write('{}\n'.format('\t'.join([line[idx[f]] for f in newFields])))

    ### Create contigtable.
    with open(perlVars['$contigtable']) as infile, open('{}/contigs.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write(infile.readline())
        outfile.write(new2old(infile.readline())) # adapt header
        [outfile.write(line) for line in infile]

    ### Create bintable.
    if not int(perlVars['$nobins']):
        with open(perlVars['$bintable']) as infile, open('{}/bins.tsv'.format(args.output_dir), 'w') as outfile:
            outfile.write(infile.readline())
            outfile.write(new2old(infile.readline(), bintable=True)) # adapt header
            [outfile.write(line) for line in infile]

    ### Create sequences file.
    aafile = perlVars['$aafile']
    fna_blastx = perlVars['$fna_blastx'] if int(perlVars['$doublepass']) else None
    outname = '{}/sequences.tsv'.format(args.output_dir)
    write_orf_seqs(allORFs, aafile, fna_blastx, None, outname)
def main(args):

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path)

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise

    ### Create samples file.
    with open(perlVars['$mappingfile']) as infile, open('{}/samples.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write('Sample\ttest\n')
        addedSamples = set()
        for line in infile:
            sample = line.split('\t')[0].strip() # There shouldn't be trailing spaces though...
            if sample not in addedSamples:
                addedSamples.add(sample)
                outfile.write('{}\t{}\n'.format(sample, len(addedSamples)))

    ### Create orftable.
    allORFs = []
    goodFields = ['ORF', 'CONTIG ID', 'LENGTH AA', 'GC perc', 'GENNAME', 'TAX ORF',
                  'KEGG ID', 'KEGGFUN', 'KEGGPATH', 'COG ID', 'COGFUN', 'COGPATH', 'PFAM']
    with open(perlVars['$mergedfile']) as infile, open('{}/genes.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write(infile.readline())
        header = infile.readline().strip().split('\t')
        goodFields.extend([f for f in header if f.startswith('TPM ') or f.startswith('COVERAGE ')
                           or f.startswith('RAW READ ') or f.startswith('RAW BASE ')])
        outfile.write('\t'.join(goodFields) + '\n')
        idx = {f: i for i, f in enumerate(header) if f in goodFields}
        for line in infile:
            line = line.strip().split('\t')
            if line[2] == 'CDS':
                allORFs.append(line[0])
                outfile.write('{}\n'.format('\t'.join([line[idx[f]] for f in goodFields])))

    ### Create contigtable.
    system('cp {} {}/contigs.tsv'.format(perlVars['$contigtable'], args.output_dir))

    ### Create bintable.
    if not int(perlVars['$nobins']):
        system('cp {} {}/bins.tsv'.format(perlVars['$bintable'], args.output_dir))

    ### Create sequences file.
    # Load prodigal results.
    ORFseq = parse_fasta(perlVars['$aafile'])
    # Load blastx results if required.
    if int(perlVars['$doublepass']):
        ORFseq.update(parse_fasta(perlVars['$fna_blastx']))
    # Write results.
    with open('{}/sequences.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write('ORF\tAASEQ\n')
        for ORF in allORFs:
            outfile.write('{}\t{}\n'.format(ORF, ORFseq[ORF]))
def main(args):

    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path)
    nokegg, nocog, nopfam, doublepass = map(int, [perlVars['$nokegg'], perlVars['$nocog'],
                                                  perlVars['$nopfam'], perlVars['$doublepass']])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    sampleNames, orfs, kegg, cog, pfam = parse_orf_table(perlVars['$mergedfile'], nokegg, nocog, nopfam,
                                                         args.trusted_functions, args.ignore_unclassified)

    # Round aggregated functional abundances.
    # We can have non-integer abundances bc of the way we split counts in ORFs with multiple KEGGs.
    # We round the aggregates to the closest integer for convenience.
    kegg['abundances'] = {k: a.round().astype(int) for k, a in kegg['abundances'].items()}
    cog['abundances']  = {k: a.round().astype(int) for k, a in cog['abundances'].items()}
    pfam['abundances'] = {k: a.round().astype(int) for k, a in pfam['abundances'].items()}

    #write_results(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
    if not nokegg:
        write_results(sampleNames, kegg['abundances'], prefix + 'KO.abund.tsv')
        write_results(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
    if not nocog:
        write_results(sampleNames, cog['abundances'], prefix + 'COG.abund.tsv')
        write_results(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
    if not nopfam:
        write_results(sampleNames, pfam['abundances'], prefix + 'PFAM.abund.tsv')
        write_results(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')

    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    unclass_list = ['Unclassified' for rank in TAXRANKS_SHORT]
    unclass_list_wranks = ['{}_Unclassified'.format(rank) for rank in TAXRANKS_SHORT]
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}
    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        if 'Bacteria' in (tax[0], tax_nofilter[0]) or 'Archaea' in (tax[0], tax_nofilter[0]): # We check both taxonomies.
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(perlVars['$contigtable'])

    if args.write_parsed_tax:
        write_results(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
        write_results(TAXRANKS, orf_tax_nofilter, prefix + 'orf.tax.nofilter.tsv')
        write_results(TAXRANKS, orf_tax_prokfilter, prefix + 'orf.tax.prokfilter.tsv')
        write_results(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    for idx, rank in enumerate(TAXRANKS):
        tax_abunds_orfs = aggregate_tax_abunds(orfs['abundances'], orf_tax_prokfilter, idx)
        write_results(sampleNames, tax_abunds_orfs, prefix + '{}.prokfilter.abund.orfs.tsv'.format(rank))

        tax_abunds_contigs = aggregate_tax_abunds(contig_abunds, contig_tax_wranks, idx)
        write_results(sampleNames, tax_abunds_contigs, prefix + '{}.allfilter.abund.tsv'.format(rank))
        write_results(sampleNames, normalize_abunds(tax_abunds_contigs, 100), prefix + '{}.allfilter.percent.tsv'.format(rank))