Example #1
def main(args):
    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path,
                               override={'$projectdir': args.project_path})
    nokegg, nocog, nopfam, doublepass = map(int, [
        perlVars['$nokegg'], perlVars['$nocog'], perlVars['$nopfam'],
        perlVars['$doublepass']
    ])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST: only tolerate "directory already exists"
            raise
        elif args.sqm2anvio or args.force_overwrite:  # We know what we are doing.
            pass
        else:
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            exit(1)

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    ### Functions
    if not args.sqm2anvio:
        # Were custom annotation databases used in this project?
        methods = [
            f.split('.')[-1] for f in listdir(perlVars['$resultpath'])
            if len(f.split('.')) > 2 and f.split('.')[-2] == 'fun3'
        ]
        customMethods = [
            method for method in methods
            if method not in ('kegg', 'cog', 'pfam', 'wranks')
        ]

        # Parse ORF table.
        sampleNames, orfs, kegg, cog, pfam, custom = parse_orf_table(
            perlVars['$mergedfile'], nokegg, nocog, nopfam,
            args.trusted_functions, args.ignore_unclassified, customMethods)

        # Round aggregated functional abundances.
        # We can have non-integer abundances because of the way we split counts in ORFs with multiple KEGG annotations.
        # We round the aggregates to the nearest integer for convenience.
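        # For example (hypothetical numbers): an ORF with 10 reads and two KEGG
        # annotations contributes 5.0 reads to each KO; summing such fractional
        # contributions over many ORFs can yield e.g. 7.5, which the code below rounds to 8.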
        for fun_dict in (kegg, cog, pfam):
            fun_dict['abundances'] = {
                k: a.round().astype(int)
                for k, a in fun_dict['abundances'].items()
            }

        #write_row_dict(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
        if not nokegg:
            write_row_dict(['Name', 'Path'], kegg['info'],
                           prefix + 'KO.names.tsv')
            write_row_dict(sampleNames, kegg['abundances'],
                           prefix + 'KO.abund.tsv')
            write_row_dict(sampleNames, kegg['bases'], prefix + 'KO.bases.tsv')
            write_row_dict(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
            if 'copyNumber' in kegg:
                write_row_dict(sampleNames, kegg['copyNumber'],
                               prefix + 'KO.copyNumber.tsv')
        if not nocog:
            write_row_dict(['Name', 'Path'], cog['info'],
                           prefix + 'COG.names.tsv')
            write_row_dict(sampleNames, cog['abundances'],
                           prefix + 'COG.abund.tsv')
            write_row_dict(sampleNames, cog['bases'], prefix + 'COG.bases.tsv')
            write_row_dict(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
            if 'copyNumber' in cog:
                write_row_dict(sampleNames, cog['copyNumber'],
                               prefix + 'COG.copyNumber.tsv')
                write_row_dict(sampleNames,
                               {'COG0468': cog['coverages']['COG0468']},
                               prefix + 'RecA.tsv')
        if not nopfam:
            write_row_dict(sampleNames, pfam['abundances'],
                           prefix + 'PFAM.abund.tsv')
            write_row_dict(sampleNames, pfam['bases'],
                           prefix + 'PFAM.bases.tsv')
            write_row_dict(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')
            if 'copyNumber' in pfam:
                write_row_dict(sampleNames, pfam['copyNumber'],
                               prefix + 'PFAM.copyNumber.tsv')
        for method, d in custom.items():
            write_row_dict(['Name'], d['info'], prefix + method + '.names.tsv')
            write_row_dict(sampleNames, d['abundances'],
                           prefix + method + '.abund.tsv')
            write_row_dict(sampleNames, d['bases'],
                           prefix + method + '.bases.tsv')
            write_row_dict(sampleNames, d['tpm'], prefix + method + '.tpm.tsv')
            if 'copyNumber' in d:
                write_row_dict(sampleNames, d['copyNumber'],
                               prefix + method + '.copyNumber.tsv')

    else:
        # Not super beautiful code. Just read the orf names and create a fake orf dict
        # since we need to know the names of all the orfs to create the taxonomy output.
        orfs = {'abundances': read_orf_names(perlVars['$mergedfile'])}
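        # From its usage below, read_orf_names presumably returns a dict keyed by
        # ORF name; only the keys are used here (iteration and .keys()), so the
        # values act as placeholders. (Inference from this snippet, not verified.)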

    ### Taxonomy.
    fun_prefix = perlVars['$fun3tax_blastx'] if doublepass else perlVars['$fun3tax']
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(
        fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}

    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        # We check both taxonomies: prokaryotes keep the identity-filtered taxonomy,
        # everything else keeps the unfiltered one.
        if 'Bacteria' in (tax[0], tax_nofilter[0]) or 'Archaea' in (tax[0], tax_nofilter[0]):
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(
        perlVars['$contigtable'])

    write_row_dict(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
    write_row_dict(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    if not args.sqm2anvio:
        fna_blastx = perlVars['$fna_blastx'] if doublepass else None
        write_orf_seqs(orfs['abundances'].keys(), perlVars['$aafile'],
                       fna_blastx, perlVars['$rnafile'],
                       perlVars['$trnafile'] + '.fasta',
                       prefix + 'orf.sequences.tsv')
        write_contig_seqs(perlVars['$contigsfna'],
                          prefix + 'contig.sequences.tsv')

        write_row_dict(TAXRANKS, orf_tax_nofilter,
                       prefix + 'orf.tax.nofilter.tsv')
        write_row_dict(TAXRANKS, orf_tax_prokfilter,
                       prefix + 'orf.tax.prokfilter.tsv')

        ### Bins
        if not int(perlVars['$nobins']):
            bin_tpm, bin_tax, bin_tax_wranks = parse_bin_table(
                perlVars['$bintable'])
            write_row_dict(TAXRANKS, bin_tax, prefix + 'bin.tax.tsv')

        for idx, rank in enumerate(TAXRANKS):
            tax_abunds_orfs = aggregate_tax_abunds(orfs['abundances'],
                                                   orf_tax_prokfilter_wranks,
                                                   idx)
            write_row_dict(sampleNames, tax_abunds_orfs,
                           prefix + '{}.prokfilter.abund.tsv'.format(rank))
            #write_row_dict(sampleNames, normalize_abunds(tax_abunds_orfs, 100), prefix + '{}.prokfilter.percent.tsv'.format(rank))

            tax_abunds_contigs = aggregate_tax_abunds(contig_abunds,
                                                      contig_tax_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_contigs,
                           prefix + '{}.allfilter.abund.tsv'.format(rank))
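

# For orientation, a minimal sketch of what a helper like write_row_dict could look
# like, inferred purely from how it is called above (a header list, a dict mapping
# row names to per-column values, and an output path). The sketch and its name are
# assumptions for illustration, not the actual SqueezeMeta implementation.
def write_row_dict_sketch(header, row_dict, path):
    with open(path, 'w') as outfile:
        # One header row, then one tab-separated row per key of the dict.
        outfile.write('\t'.join(header) + '\n')
        for name, values in row_dict.items():
            outfile.write('{}\t{}\n'.format(name, '\t'.join(str(v) for v in values)))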
Example #2
def main(args):
    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path)

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
        else:
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            exit(1)

    ### Create samples file.
    with open(perlVars['$mappingfile']) as infile, open(
            '{}/samples.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write('Sample\ttest\n')
        addedSamples = set()
        for line in infile:
            sample = line.split('\t')[0].strip()  # There shouldn't be trailing spaces, though...
            if sample not in addedSamples:
                addedSamples.add(sample)
                outfile.write('{}\t{}\n'.format(sample, len(addedSamples)))
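    # The resulting samples.tsv looks like this (hypothetical sample names):
    #   Sample<TAB>test
    #   sample1<TAB>1
    #   sample2<TAB>2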

    ### Create orftable.
    def new2old(str_, orftable=False, bintable=False):
        """Replace 1.0 headers with old headers, so we don't have to modify and re-deploy SQMdb"""
        if orftable:
            str_ = (str_.replace('Coverage', 'COVERAGE')
                        .replace('Raw read count', 'RAW READ COUNT')
                        .replace('Raw base count', 'RAW BASE COUNT'))
        else:
            str_ = str_.replace('Strain het', 'Strain Het').replace('Raw read count', 'Raw')
        if bintable:
            str_ = str_.replace('Length', 'Size')
        return str_
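    # e.g. new2old('Raw read count Sample1', orftable=True) -> 'RAW READ COUNT Sample1'
    #      new2old('Strain het')                            -> 'Strain Het'
    # ('Sample1' is a hypothetical sample name.)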

    allORFs = []
    newFields = [
        'ORF ID', 'Contig ID', 'Length AA', 'GC perc', 'Gene name', 'Tax',
        'KEGG ID', 'KEGGFUN', 'KEGGPATH', 'COG ID', 'COGFUN', 'COGPATH', 'PFAM'
    ]
    goodFields = [
        'ORF', 'CONTIG ID', 'LENGTH AA', 'GC perc', 'GENNAME', 'TAX ORF',
        'KEGG ID', 'KEGGFUN', 'KEGGPATH', 'COG ID', 'COGFUN', 'COGPATH', 'PFAM'
    ]
    with open(perlVars['$mergedfile']) as infile, open(
            '{}/genes.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write(infile.readline())
        header = infile.readline().strip().split('\t')
        newFields.extend([
            f for f in header
            if f.startswith('TPM ') or f.startswith('Coverage ')
            or f.startswith('Raw read ') or f.startswith('Raw base ')
        ])
        goodFields.extend([
            new2old(f, True) for f in header
            if f.startswith('TPM ') or f.startswith('Coverage ')
            or f.startswith('Raw read ') or f.startswith('Raw base ')
        ])
        outfile.write('\t'.join(goodFields) + '\n')
        idx = {f: i for i, f in enumerate(header) if f in newFields}
        for line in infile:
            line = line.strip().split('\t')
            if line[2] == 'CDS':
                allORFs.append(line[0])
                outfile.write('{}\n'.format('\t'.join(
                    [line[idx[f]] for f in newFields])))
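    # genes.tsv thus contains: the first line of the merged table copied verbatim,
    # a reduced and renamed column header (goodFields), and then one row per CDS ORF
    # (non-CDS features are skipped).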

    ### Create contigtable.
    with open(perlVars['$contigtable']) as infile, open(
            '{}/contigs.tsv'.format(args.output_dir), 'w') as outfile:
        outfile.write(infile.readline())
        outfile.write(new2old(infile.readline()))  # adapt header
        for line in infile:
            outfile.write(line)

    ### Create bintable.
    if not int(perlVars['$nobins']):
        with open(perlVars['$bintable']) as infile, open(
                '{}/bins.tsv'.format(args.output_dir), 'w') as outfile:
            outfile.write(infile.readline())
            outfile.write(new2old(infile.readline(),
                                  bintable=True))  # adapt header
            for line in infile:
                outfile.write(line)

    ### Create sequences file.
    aafile = perlVars['$aafile']
    fna_blastx = perlVars['$fna_blastx'] if int(perlVars['$doublepass']) else None
    outname = '{}/sequences.tsv'.format(args.output_dir)
    write_orf_seqs(allORFs, aafile, fna_blastx, None, outname)
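

# A minimal sketch (not part of the original script) of an argparse setup that would
# supply the two attributes this main() actually reads, args.project_path and
# args.output_dir. The real SqueezeMeta wrapper may define its command line differently.
if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Export SqueezeMeta results to flat tables')
    parser.add_argument('project_path', help='path to the SqueezeMeta project directory')
    parser.add_argument('output_dir', help='directory where the output files will be written')
    main(parser.parse_args())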