Example 1
def tax_file_to_dict(path, out_dict):
    with open(path) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            # Strip '_nofilter' from the path so reads from the allfilter
            # and nofilter files share the same key, while reads from the
            # fwd and rev files stay distinct.
            read = path.replace('_nofilter', '') + fields[0]
            tax_string = fields[1] if len(fields) > 1 else 'n_Unclassified'
            tax, tax_wranks = parse_tax_string(tax_string)
            out_dict[read] = tax_wranks
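
For context, a minimal usage sketch. The file names below are made up for
illustration, and parse_tax_string is assumed to be in scope; the point is
that stripping '_nofilter' makes both calls build identical keys:

# Hypothetical usage; file names are invented, not guaranteed to exist.
nofilter, allfilter = {}, {}
tax_file_to_dict('sample1.tax_nofilter.wranks', nofilter)
tax_file_to_dict('sample1.tax.wranks', allfilter)
# Both calls used the prefix 'sample1.tax.wranks' for their keys, so:
assert nofilter.keys() == allfilter.keys()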
Example 2
def add_features(abunds, tax, tax_wranks, tax_nofilter,
                 tax_nofilter_wranks):
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for feat in abunds:
        if feat not in tax:
            assert feat not in tax_wranks
            assert feat not in tax_nofilter
            assert feat not in tax_nofilter_wranks
            tax[feat] = unclass_list
            tax_wranks[feat] = unclass_list_wranks
            tax_nofilter[feat] = unclass_list
            tax_nofilter_wranks[feat] = unclass_list_wranks
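
A hedged usage sketch (feature names and abundances are made up; the
function mutates the four taxonomy dicts in place):

# Hypothetical call illustrating the in-place defaulting.
abunds = {'ORF1': [10, 3], 'ORF2': [0, 7]}
tax, tax_wranks, tax_nofilter, tax_nofilter_wranks = {}, {}, {}, {}
add_features(abunds, tax, tax_wranks, tax_nofilter, tax_nofilter_wranks)
# Every feature now maps to the 'Unclassified' taxonomy in all four dicts.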
Example 3
def main(args):
    ### Get result files paths from SqueezeMeta_conf.pl
    perlVars = parse_conf_file(args.project_path,
                               override={'$projectdir': args.project_path})
    nokegg, nocog, nopfam, doublepass = map(int, [
        perlVars['$nokegg'], perlVars['$nocog'], perlVars['$nopfam'],
        perlVars['$doublepass']
    ])

    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST
            raise
        elif args.sqm2anvio or args.force_overwrite:  # We know what we are doing.
            pass
        else:
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            exit(1)

    ### Calculate tables and write results.
    prefix = args.output_dir + '/' + perlVars['$projectname'] + '.'

    ### Functions
    if not args.sqm2anvio:
        # Were custom annotation databases used in this project?
        methods = [
            f.split('.')[-1] for f in listdir(perlVars['$resultpath'])
            if len(f.split('.')) > 2 and f.split('.')[-2] == 'fun3'
        ]
        customMethods = [
            method for method in methods
            if method not in ('kegg', 'cog', 'pfam', 'wranks')
        ]

        # Parse ORF table.
        sampleNames, orfs, kegg, cog, pfam, custom = parse_orf_table(
            perlVars['$mergedfile'], nokegg, nocog, nopfam,
            args.trusted_functions, args.ignore_unclassified, customMethods)

        # Round aggregated functional abundances.
        # We can have non-integer abundances because of the way we split
        # counts in ORFs with multiple KEGGs; we round the aggregates to the
        # closest integer for convenience.
        for d in (kegg, cog, pfam):
            d['abundances'] = {
                k: a.round().astype(int)
                for k, a in d['abundances'].items()
            }

        #write_row_dict(sampleNames, orfs['tpm'], prefix + 'orf.tpm.tsv')
        if not nokegg:
            write_row_dict(['Name', 'Path'], kegg['info'],
                           prefix + 'KO.names.tsv')
            write_row_dict(sampleNames, kegg['abundances'],
                           prefix + 'KO.abund.tsv')
            write_row_dict(sampleNames, kegg['bases'], prefix + 'KO.bases.tsv')
            write_row_dict(sampleNames, kegg['tpm'], prefix + 'KO.tpm.tsv')
            if 'copyNumber' in kegg:
                write_row_dict(sampleNames, kegg['copyNumber'],
                               prefix + 'KO.copyNumber.tsv')
        if not nocog:
            write_row_dict(['Name', 'Path'], cog['info'],
                           prefix + 'COG.names.tsv')
            write_row_dict(sampleNames, cog['abundances'],
                           prefix + 'COG.abund.tsv')
            write_row_dict(sampleNames, cog['bases'], prefix + 'COG.bases.tsv')
            write_row_dict(sampleNames, cog['tpm'], prefix + 'COG.tpm.tsv')
            if 'copyNumber' in cog:
                write_row_dict(sampleNames, cog['copyNumber'],
                               prefix + 'COG.copyNumber.tsv')
                write_row_dict(sampleNames,
                               {'COG0468': cog['coverages']['COG0468']},
                               prefix + 'RecA.tsv')
        if not nopfam:
            write_row_dict(sampleNames, pfam['abundances'],
                           prefix + 'PFAM.abund.tsv')
            write_row_dict(sampleNames, pfam['bases'],
                           prefix + 'PFAM.bases.tsv')
            write_row_dict(sampleNames, pfam['tpm'], prefix + 'PFAM.tpm.tsv')
            if 'copyNumber' in pfam:
                write_row_dict(sampleNames, pfam['copyNumber'],
                               prefix + 'PFAM.copyNumber.tsv')
        for method, d in custom.items():
            write_row_dict(['Name'], d['info'], prefix + method + '.names.tsv')
            write_row_dict(sampleNames, d['abundances'],
                           prefix + method + '.abund.tsv')
            write_row_dict(sampleNames, d['bases'],
                           prefix + method + '.bases.tsv')
            write_row_dict(sampleNames, d['tpm'], prefix + method + '.tpm.tsv')
            if 'copyNumber' in d:
                write_row_dict(sampleNames, d['copyNumber'],
                               prefix + method + '.copyNumber.tsv')

    else:
        # Not super beautiful code. Just read the orf names and create a fake orf dict
        # since we need to know the names of all the orfs to create the taxonomy output.
        orfs = {'abundances': read_orf_names(perlVars['$mergedfile'])}

    ### Taxonomy.
    fun_prefix = (perlVars['$fun3tax_blastx'] if doublepass
                  else perlVars['$fun3tax'])
    orf_tax, orf_tax_wranks = parse_tax_table(fun_prefix + '.wranks')
    orf_tax_nofilter, orf_tax_nofilter_wranks = parse_tax_table(
        fun_prefix + '.noidfilter.wranks')

    # Add ORFs not present in the input tax file.
    unclass_list, unclass_list_wranks = parse_tax_string('n_Unclassified')
    for orf in orfs['abundances']:
        if orf not in orf_tax:
            assert orf not in orf_tax_wranks
            assert orf not in orf_tax_nofilter
            assert orf not in orf_tax_nofilter_wranks
            orf_tax[orf] = unclass_list
            orf_tax_wranks[orf] = unclass_list_wranks
            orf_tax_nofilter[orf] = unclass_list
            orf_tax_nofilter_wranks[orf] = unclass_list_wranks

    orf_tax_prokfilter, orf_tax_prokfilter_wranks = {}, {}

    for orf in orf_tax:
        tax = orf_tax[orf]
        tax_nofilter = orf_tax_nofilter[orf]
        # We check both taxonomies.
        kingdoms = (tax[0], tax_nofilter[0])
        if 'Bacteria' in kingdoms or 'Archaea' in kingdoms:
            orf_tax_prokfilter[orf] = tax
            orf_tax_prokfilter_wranks[orf] = orf_tax_wranks[orf]
        else:
            orf_tax_prokfilter[orf] = tax_nofilter
            orf_tax_prokfilter_wranks[orf] = orf_tax_nofilter_wranks[orf]

    contig_abunds, contig_tax, contig_tax_wranks = parse_contig_table(
        perlVars['$contigtable'])

    write_row_dict(TAXRANKS, orf_tax, prefix + 'orf.tax.allfilter.tsv')
    write_row_dict(TAXRANKS, contig_tax, prefix + 'contig.tax.tsv')

    if not args.sqm2anvio:
        fna_blastx = perlVars['$fna_blastx'] if doublepass else None
        write_orf_seqs(orfs['abundances'].keys(), perlVars['$aafile'],
                       fna_blastx, perlVars['$rnafile'],
                       perlVars['$trnafile'] + '.fasta',
                       prefix + 'orf.sequences.tsv')
        write_contig_seqs(perlVars['$contigsfna'],
                          prefix + 'contig.sequences.tsv')

        write_row_dict(TAXRANKS, orf_tax_nofilter,
                       prefix + 'orf.tax.nofilter.tsv')
        write_row_dict(TAXRANKS, orf_tax_prokfilter,
                       prefix + 'orf.tax.prokfilter.tsv')

        ### Bins
        if not int(perlVars['$nobins']):
            bin_tpm, bin_tax, bin_tax_wranks = parse_bin_table(
                perlVars['$bintable'])
            write_row_dict(TAXRANKS, bin_tax, prefix + 'bin.tax.tsv')

        for idx, rank in enumerate(TAXRANKS):
            tax_abunds_orfs = aggregate_tax_abunds(orfs['abundances'],
                                                   orf_tax_prokfilter_wranks,
                                                   idx)
            write_row_dict(sampleNames, tax_abunds_orfs,
                           prefix + '{}.prokfilter.abund.tsv'.format(rank))
            #write_row_dict(sampleNames, normalize_abunds(tax_abunds_orfs, 100), prefix + '{}.prokfilter.percent.tsv'.format(rank))

            tax_abunds_contigs = aggregate_tax_abunds(contig_abunds,
                                                      contig_tax_wranks, idx)
            write_row_dict(sampleNames, tax_abunds_contigs,
                           prefix + '{}.allfilter.abund.tsv'.format(rank))
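
Both versions of main() lean on parse_tax_string, which is defined elsewhere
in the package. Its interface can be inferred from the call sites: it takes a
semicolon-separated, rank-prefixed tax string (e.g. 'k_Bacteria;p_Firmicutes')
and returns two rank-indexed lists, one with bare names and one keeping the
rank prefixes. A minimal sketch consistent with that inferred contract, not
the actual implementation:

# Sketch only: infers the interface from the call sites above; the real
# parse_tax_string may fill in missing ranks differently.
RANK_PREFIXES = ['k', 'p', 'c', 'o', 'f', 'g', 's']

def parse_tax_string_sketch(tax_string):
    # 'n_' marks an unranked entry such as 'n_Unclassified'.
    fields = dict(f.split('_', 1) for f in tax_string.split(';') if '_' in f)
    tax, tax_wranks = [], []
    for prefix in RANK_PREFIXES:
        name = fields.get(prefix, 'Unclassified')
        tax.append(name)
        tax_wranks.append('{}_{}'.format(prefix, name))
    return tax, tax_wranks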
Example 4
def main(args):
    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST
            raise
        elif args.force_overwrite:
            pass
        else:
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            exit(1)

    ### Project name and samples.
    project_name = args.project_path.strip('/').split('/')[-1]
    output_prefix = project_name  #args.output_dir.strip('/').split('/')[-1]
    samples = defaultdict(int)
    with open('{}/{}.out.mappingstat'.format(args.project_path,
                                             project_name)) as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            (sample, file_, total_reads,
             reads_with_hits_to_nr) = line.strip().split('\t')
            samples[sample] += int(total_reads)

    ### Parse taxonomy.

    def tax_file_to_dict(path, out_dict):
        with open(path) as infile:
            for line in infile:
                fields = line.strip().split('\t')
                # Strip '_nofilter' from the path so reads from the allfilter
                # and nofilter files share the same key, while reads from the
                # fwd and rev files stay distinct.
                read = path.replace('_nofilter', '') + fields[0]
                tax_string = fields[1] if len(fields) > 1 else 'n_Unclassified'
                tax, tax_wranks = parse_tax_string(tax_string)
                out_dict[read] = tax_wranks

    tax_dict = {
        filt: {
            rank: {sample: defaultdict(int)
                   for sample in samples}
            for rank in TAXRANKS
        }
        for filt in TAXFILTERS
    }

    for sample in samples:

        read_tax = {'nofilter': {}, 'allfilter': {}, 'prokfilter': {}}

        ### Parse nofilter taxonomy.
        nofilter_tax_files = [
            f for f in listdir('{}/{}'.format(args.project_path, sample))
            if f.endswith('.tax_nofilter.wranks')
        ]
        for tax_file in nofilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['nofilter'])

        ### Parse taxonomy with filters.
        allfilter_tax_files = [
            f for f in listdir('{}/{}'.format(args.project_path, sample))
            if f.endswith('.tax.wranks')
        ]
        for tax_file in allfilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['allfilter'])

        assert read_tax['nofilter'].keys() == read_tax['allfilter'].keys()

        ### Generate taxonomy with filters only for prokaryotes.
        for read in read_tax['nofilter']:
            kingdoms = (read_tax['nofilter'][read][0],
                        read_tax['allfilter'][read][0])
            if 'k_Bacteria' in kingdoms or 'k_Archaea' in kingdoms:
                read_tax['prokfilter'][read] = read_tax['allfilter'][read]
            else:
                read_tax['prokfilter'][read] = read_tax['nofilter'][read]

        ### Aggregate counts from the same taxa.
        for filt in TAXFILTERS:
            for read, tax in read_tax[filt].items():
                for i, rank in enumerate(TAXRANKS):
                    tax_dict[filt][rank][sample][tax[i]] += 1

    ### Add unclassified and write results.
    for filt in TAXFILTERS:
        if filt == 'nofilter':
            continue
        for i, rank in enumerate(TAXRANKS):
            dict_to_write = tax_dict[filt][rank]
            for sample, taxa in dict_to_write.items():
                classified_reads = sum(taxa.values())
                total_reads = samples[sample]
                unclass_key = parse_tax_string('n_Unclassified')[1][i]
                dict_to_write[sample][unclass_key] += (total_reads -
                                                       classified_reads)
            DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
                '{}/{}.{}.{}.abund.tsv'.format(args.output_dir, output_prefix,
                                               rank, filt),
                sep='\t')

    ### Parse functions.

    # Is there any custom annotation method apart from kegg and COG?
    custom_methods = [
        f.split('.')[-1].replace('fun', '') for f in listdir(args.project_path)
        if 'fun' in f.split('.')[-1] and not f.endswith('funcog')
        and not f.endswith('funkegg')
    ]
    for method in custom_methods:
        FUNMETHODS[method] = method

    fun_dict = {
        method: {sample: defaultdict(float)
                 for sample in samples}
        for method in FUNMETHODS
    }
    for sample in samples:
        for method in FUNMETHODS:
            fun_files = [
                f for f in listdir('{}/{}'.format(args.project_path, sample))
                if f.endswith(method)
            ]
            for fun_file in fun_files:
                with open('{}/{}/{}'.format(args.project_path, sample,
                                            fun_file)) as infile:
                    infile.readline()  # Burn the two header lines.
                    infile.readline()
                    for line in infile:
                        fields = line.strip().split('\t')
                        while len(fields) < 3:
                            fields.append('Unclassified')
                        funs = (fields[2] if args.trusted_functions
                                else fields[1])
                        funs = funs.split(';')
                        for fun in funs:
                            # Split the counts in multi-KEGG annotations.
                            fun_dict[method][sample][fun] += 1 / len(funs)

    for method, method_name in FUNMETHODS.items():
        dict_to_write = fun_dict[method]
        for sample, funs in dict_to_write.items():
            classified_reads = sum(funs.values())
            total_reads = samples[sample]
            dict_to_write[sample]['Unclassified'] += (total_reads -
                                                      classified_reads)
        DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
            '{}/{}.{}.abund.tsv'.format(args.output_dir, output_prefix,
                                        method_name),
            sep='\t')
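
The 1 / len(funs) increment above splits one read's count evenly across its
annotations, which is why abundances can be fractional. A tiny worked example
of that arithmetic (the KO IDs are made up):

# Two reads: one with a single KO, one annotated with two KOs.
from collections import defaultdict

counts = defaultdict(float)
for funs in (['K00001'], ['K00001', 'K00002']):
    for fun in funs:
        counts[fun] += 1 / len(funs)
# counts == {'K00001': 1.5, 'K00002': 0.5}; totals still sum to 2 reads.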
Example 5
def main(args):
    ### Create output dir.
    try:
        mkdir(args.output_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST
            raise
        elif args.force_overwrite:
            pass
        else:
            print(
                '\nThe directory {} already exists. Please remove it or use a different output name.\n'
                .format(args.output_dir))
            exit(1)

    ### Project name and samples.
    project_name = args.project_path.strip('/').split('/')[-1]
    output_prefix = project_name  #args.output_dir.strip('/').split('/')[-1]
    samples = defaultdict(int)
    samples_orfs = defaultdict(int)
    with open('{}/{}.out.mappingstat'.format(args.project_path,
                                             project_name)) as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            # *total_hits catches the extra column present in longreads output.
            (sample, file_, total_reads, reads_with_hits_to_nr,
             *total_hits) = line.strip().split('\t')
            if total_hits:
                total_orfs = 0
                # In longreads mode we can have more than one ORF per read.
                fasta_path = '{}/{}/{}.nt.fasta'.format(args.project_path,
                                                        sample, sample)
                with open(fasta_path) as infile:
                    for line in infile:
                        if line.startswith('>'):
                            total_orfs += 1
                samples_orfs[sample] = total_orfs
                longreads = True
            else:
                longreads = False
            samples[sample] += int(total_reads)

    ### Parse taxonomy.

    def tax_file_to_dict(path, out_dict):
        with open(path) as infile:
            for line in infile:
                fields = line.strip().split('\t')
                # Strip '_nofilter' from the path so reads from the allfilter
                # and nofilter files share the same key, while reads from the
                # fwd and rev files stay distinct.
                read = path.replace('_nofilter', '') + fields[0]
                if not longreads:
                    tax_string = (fields[1] if len(fields) > 1
                                  else 'n_Unclassified')
                else:
                    tax_string = fields[1] if fields[1] else 'n_Unclassified'
                tax, tax_wranks = parse_tax_string(tax_string)
                out_dict[read] = tax_wranks

    tax_dict = {
        filt: {
            rank: {sample: defaultdict(int)
                   for sample in samples}
            for rank in TAXRANKS
        }
        for filt in TAXFILTERS
    }

    for sample in samples:

        read_tax = {'nofilter': {}, 'allfilter': {}, 'prokfilter': {}}

        ### Parse nofilter taxonomy
        if not longreads:
            nofilter_tax_files = [
                f for f in listdir('{}/{}'.format(args.project_path, sample))
                if f.endswith('.tax_nofilter.wranks')
            ]
        else:
            nofilter_tax_files = ['readconsensus_nofilter.txt']
        for tax_file in nofilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['nofilter'])

        ### Parse taxonomy with filters.
        if not longreads:
            allfilter_tax_files = [
                f for f in listdir('{}/{}'.format(args.project_path, sample))
                if f.endswith('.tax.wranks')
            ]
        else:
            allfilter_tax_files = ['readconsensus.txt']
        for tax_file in allfilter_tax_files:
            path = '{}/{}/{}'.format(args.project_path, sample, tax_file)
            tax_file_to_dict(path, read_tax['allfilter'])

        assert read_tax['nofilter'].keys() == read_tax['allfilter'].keys()

        ### Generate taxonomy with filters only for prokaryotes.
        for read in read_tax['nofilter']:
            kingdoms = (read_tax['nofilter'][read][0],
                        read_tax['allfilter'][read][0])
            if 'k_Bacteria' in kingdoms or 'k_Archaea' in kingdoms:
                read_tax['prokfilter'][read] = read_tax['allfilter'][read]
            else:
                read_tax['prokfilter'][read] = read_tax['nofilter'][read]

        ### Aggregate counts from the same taxa.
        for filt in TAXFILTERS:
            for read, tax in read_tax[filt].items():
                for i, rank in enumerate(TAXRANKS):
                    tax_dict[filt][rank][sample][tax[i]] += 1

    ### Add unclassified and write results.
    for filt in TAXFILTERS:
        #if filt == 'nofilter':
        #    continue
        for i, rank in enumerate(TAXRANKS):
            dict_to_write = tax_dict[filt][rank]
            for sample, taxa in dict_to_write.items():
                classified_reads = sum(taxa.values())
                total_reads = samples[sample]
                unclass_key = parse_tax_string('n_Unclassified')[1][i]
                dict_to_write[sample][unclass_key] += (total_reads -
                                                       classified_reads)
            DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
                '{}/{}.{}.{}.abund.tsv'.format(args.output_dir, output_prefix,
                                               rank, filt),
                sep='\t')

    ### Parse functions.

    # Is there any custom annotation method apart from kegg and COG?
    custom_methods = [
        f.split('.')[-1].replace('fun', '') for f in listdir(args.project_path)
        if 'fun' in f.split('.')[-1] and not f.endswith('funcog')
        and not f.endswith('funkegg')
    ]
    for method in custom_methods:
        FUNMETHODS[method] = method

    found_methods = set()

    fun_dict = {
        method: {sample: defaultdict(float)
                 for sample in samples}
        for method in FUNMETHODS
    }
    for sample in samples:
        for method in FUNMETHODS:
            fun_files = [
                f for f in listdir('{}/{}'.format(args.project_path, sample))
                if f.endswith(method)
            ]
            for fun_file in fun_files:
                found_methods.add(method)
                with open('{}/{}/{}'.format(args.project_path, sample,
                                            fun_file)) as infile:
                    infile.readline()  # Burn the two header lines.
                    infile.readline()
                    for line in infile:
                        fields = line.strip().split('\t')
                        while len(fields) < 3:
                            fields.append('Unclassified')
                        funs = (fields[2] if args.trusted_functions
                                else fields[1])
                        funs = funs.split(';')
                        for fun in funs:
                            # Split the counts in multi-KEGG annotations.
                            fun_dict[method][sample][fun] += 1 / len(funs)

    for method, method_name in FUNMETHODS.items():
        dict_to_write = fun_dict[method]
        for sample, funs in dict_to_write.items():
            classified_reads = sum(funs.values())
            if not longreads:
                total_reads = samples[sample]
            else:
                # In longreads mode we can have more than one ORF per read.
                # For taxonomy we take the consensus for each read, but for
                # functions we count ORFs independently.
                total_reads = samples_orfs[sample]
            dict_to_write[sample]['Unclassified'] += (total_reads -
                                                      classified_reads)
        DataFrame.from_dict(dict_to_write).fillna(0).to_csv(
            '{}/{}.{}.abund.tsv'.format(args.output_dir, output_prefix,
                                        method_name),
            sep='\t')

    # Write function names and hierarchy paths for kegg/cog.
    for method in found_methods:
        if method not in ('kegg', 'cogs'):
            continue
        method_name = FUNMETHODS[method]
        function_info = 'keggfun2.txt' if method == 'kegg' else 'coglist.txt'
        with open('{}/{}'.format(data_dir, function_info)) as infile, open(
                '{}/{}.{}.names.tsv'.format(args.output_dir, output_prefix,
                                            method_name), 'w') as outfile:
            infile.readline()  # Burn headers.
            info = {}
            for line in infile:
                if method == 'kegg':
                    (fun_id, gene_name,
                     fun_name, path) = line.strip().split('\t')
                else:
                    line = line.strip().split('\t')
                    if len(line) == 3:
                        fun_id, fun_name, path = line
                    else:  # UGH! Some COG entries lack a path.
                        fun_id, fun_name = line
                        path = '{} (path not available)'.format(fun_id)
                info[fun_id] = (fun_name, path)
            allFuns = sorted({
                fun
                for sample in fun_dict[method]
                for fun in fun_dict[method][sample]
            })
            outfile.write('\tName\tPath\n')
            for fun in allFuns:
                if fun == 'Unclassified':
                    continue
                if fun in info:
                    outfile.write('{}\t{}\t{}\n'.format(
                        fun, info[fun][0], info[fun][1]))
                else:
                    outfile.write(
                        '{}\t{} (name not available)\t{} (path not available)\n'
                        .format(fun, fun, fun))
            outfile.write('Unclassified\tUnclassified\tUnclassified\n')

    # Write function names for extra methods.
    for method in found_methods:
        if method in ('kegg', 'cogs'):
            continue
        method_name = FUNMETHODS[method]
        written = set()
        with open('{}/{}.out.allreads.fun{}'.format(args.project_path, project_name, method)) as infile, \
             open('{}/{}.{}.names.tsv'.format(args.output_dir, output_prefix, method_name), 'w') as outfile:
            infile.readline()  # Burn headers.
            infile.readline()
            outfile.write('\tName\n')
            for line in infile:
                # Explicitly strip just '\n' so tabs are not removed when
                # there are empty fields.
                line = line.strip('\n').split('\t')
                ID = line[0]
                if ID not in written:
                    written.add(ID)
                    outfile.write('{}\t{}\n'.format(ID, line[-1]))
            outfile.write('Unclassified\tUnclassified\n')
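
A closing note on the directory-creation boilerplate shared by the mains:
the bare errno 17 is EEXIST, so the same create-or-fail logic can be written
against the named constant. A sketch of an equivalent helper (the function
name is mine, not part of the original scripts):

# Equivalent idiom using errno.EEXIST instead of a bare 17.
import errno
from os import mkdir

def make_output_dir(output_dir, force_overwrite):
    try:
        mkdir(output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        if not force_overwrite:
            raise SystemExit(
                '\nThe directory {} already exists. Please remove it or use '
                'a different output name.\n'.format(output_dir))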