Example #1
def find_offspring(taxonomy_folder, fastaid2LCAtaxid_file, log_file, quiet):
    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)

    message = 'Searching nr database for taxids with multiple offspring.'
    shared.give_user_feedback(message, log_file, quiet)

    taxid2offspring = {}

    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip().split('\t')

            taxid = line[1]
            lineage = tax.find_lineage(taxid, taxid2parent)

            for (i, taxid) in enumerate(lineage):
                # The first taxid in the lineage is the deepest node; it has
                # no daughter within this lineage, so it is never a parent.
                if i == 0:
                    continue

                if taxid not in taxid2offspring:
                    taxid2offspring[taxid] = set()

                offspring = lineage[i - 1]

                taxid2offspring[taxid].add(offspring)

    return taxid2offspring
Example #2
def import_contig_lengths(contigs_fasta, log_file, quiet):
    message = 'Gathering contig lengths from {0}.'.format(contigs_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    contig2length = {}

    with shared.open_maybe_gzip(contigs_fasta, 'rt') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('>'):
                contig = line.split(' ')[0].lstrip('>')

                contig2length[contig] = 0
            else:
                try:
                    contig2length[contig] += len(line)
                except NameError:
                    # A sequence line appeared before any fasta header, so
                    # contig is not yet defined: not a valid contigs fasta.
                    message = ('ERROR: {0} is not a contigs fasta'
                               ''.format(contigs_fasta))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    sys.exit(1)

    return contig2length
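
A minimal usage sketch for the helper above (the file name assembly.fasta and the log_file=None, quiet=False arguments are illustrative placeholders, mirroring how the other examples call these functions):

# Hypothetical usage: gather contig lengths and report the assembly size.
contig2length = import_contig_lengths('assembly.fasta', None, False)

total_length = sum(contig2length.values())
print('{0} contigs, {1} positions in total'.format(
    len(contig2length), total_length))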
Example #3
def make_concatenated_fasta(concatenated_fasta, bin2contigs, bin_folder,
                            log_file, quiet):
    message = 'Writing {0}.'.format(concatenated_fasta)
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(concatenated_fasta, 'wt') as outf1:
        for bin_ in sorted(bin2contigs):
            with shared.open_maybe_gzip('{0}/{1}'.format(bin_folder, bin_),
                                        'rt') as f1:
                for line in f1:
                    if line.startswith('>'):
                        contig = line.split(' ')[0].rstrip().lstrip('>')

                        # Add bin name in front of the contig name.
                        outf1.write('>{0}_{1}\n'.format(bin_, contig))
                    else:
                        outf1.write(line)
Example #4
def write_taxids_with_multiple_offspring_file(
        taxids_with_multiple_offspring_file, taxid2offspring, log_file, quiet):
    message = 'Writing {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(taxids_with_multiple_offspring_file,
                                'wt') as outf1:
        for taxid in taxid2offspring:
            if len(taxid2offspring[taxid]) >= 2:
                outf1.write('{0}\n'.format(taxid))
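
find_offspring (Example #1) produces exactly the mapping this writer consumes; a hedged sketch of chaining the two during database construction (the folder and file names are placeholders, not paths from the original code):

# Hypothetical composition of Example #1 and Example #4: collect the offspring
# of every taxid seen in the fastaid2LCAtaxid mapping, then write out each
# taxid that has two or more distinct daughter taxids.
taxid2offspring = find_offspring(
    'taxonomy/', 'db/nr.fastaid2LCAtaxid', None, False)
write_taxids_with_multiple_offspring_file(
    'db/nr.taxids_with_multiple_offspring', taxid2offspring, None, False)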
Example #5
def import_bins(bin_folder, bin_suffix, log_file, quiet):
    message = 'Importing bins from {0}/.'.format(bin_folder)
    shared.give_user_feedback(message, log_file, quiet)

    bin2contigs = {}
    contig_names = set()

    for file_ in os.listdir(bin_folder):
        if file_.startswith('.'):
            # Skip hidden files.
            continue

        if not file_.endswith(bin_suffix):
            continue

        if '.concatenated.' in file_:
            # Skip concatenated contig fasta and predicted protein fasta files
            # from earlier runs.
            continue

        # Keep the suffix in the bin name.
        bin_ = file_

        bin2contigs[bin_] = []

        with shared.open_maybe_gzip('{0}/{1}'.format(bin_folder, file_),
                                    'rt') as f1:
            for line in f1:
                if line.startswith('>'):
                    contig = line.split(' ')[0].rstrip().lstrip('>')

                    # Add bin name in front of the contig name.
                    new_contig_name = '{0}_{1}'.format(bin_, contig)

                    if new_contig_name in contig_names:
                        message = ('ERROR: BAT has encountered {0} twice in '
                                   'bin {1}. Each fasta header should be '
                                   'unique in each bin.'
                                   ''.format(contig, bin_))
                        shared.give_user_feedback(message,
                                                  log_file,
                                                  quiet,
                                                  error=True)

                        sys.exit(1)

                    contig_names.add(new_contig_name)

                    bin2contigs[bin_].append(new_contig_name)

    message = '{0} bin(s) found!'.format(len(bin2contigs))
    shared.give_user_feedback(message, log_file, quiet)

    return (bin2contigs, contig_names)
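
import_bins pairs with make_concatenated_fasta (Example #3) in the BAT workflow; a minimal sketch, assuming a hypothetical folder bins/ with .fasta bins (all names are placeholders):

# Hypothetical BAT preparation step: read all bins from a folder, then write a
# single concatenated fasta in which each contig name is prefixed with its bin.
(bin2contigs, contig_names) = import_bins('bins/', '.fasta', None, False)
make_concatenated_fasta('bins.concatenated.fasta', bin2contigs, 'bins/',
                        None, False)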
Example #6
def check_whether_file_is_fasta(file_):
    is_fasta = False

    if not os.path.isfile(file_):
        return is_fasta

    with shared.open_maybe_gzip(file_, 'rt') as f1:
        for line in f1:
            if line.startswith('>'):
                is_fasta = True

            break

    return is_fasta
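
A short hedged example of how such a check might be used to filter a folder (the folder name is illustrative, and os is assumed to be imported as in the other examples):

# Hypothetical filter: keep only the files that look like fasta files.
fasta_files = [
    file_ for file_ in sorted(os.listdir('bins/'))
    if check_whether_file_is_fasta('bins/{0}'.format(file_))
]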
Example #7
def import_taxids_with_multiple_offspring(taxids_with_multiple_offspring_file,
                                          log_file,
                                          quiet):
    message = 'Importing file {0}.'.format(taxids_with_multiple_offspring_file)
    shared.give_user_feedback(message, log_file, quiet)

    taxids_with_multiple_offspring = set()

    with shared.open_maybe_gzip(taxids_with_multiple_offspring_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()

            taxids_with_multiple_offspring.add(line)

    return taxids_with_multiple_offspring
Example #8
def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
    message = 'Importing file {0}.'.format(fastaid2LCAtaxid_file)
    shared.give_user_feedback(message, log_file, quiet)

    fastaid2LCAtaxid = {}

    with shared.open_maybe_gzip(fastaid2LCAtaxid_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip().split('\t')

            if line[0] in all_hits:
                # Only include fastaids that are found in hits.
                fastaid2LCAtaxid[line[0]] = line[1]

    return fastaid2LCAtaxid
Example #9
def import_names(names_dmp, log_file, quiet):
    message = 'Importing file {0}.'.format(names_dmp)
    shared.give_user_feedback(message, log_file, quiet)

    taxid2name = {}

    with shared.open_maybe_gzip(names_dmp, 'rt') as f1:
        for line in f1:
            line = line.split('\t')

            if line[6] == 'scientific name':
                taxid = line[0]
                name = line[2]

                taxid2name[taxid] = name

    return taxid2name
Example #10
def import_nodes(nodes_dmp, log_file, quiet):
    message = 'Importing file {0}.'.format(nodes_dmp)
    shared.give_user_feedback(message, log_file, quiet)
    
    taxid2parent = {}
    taxid2rank = {}

    with shared.open_maybe_gzip(nodes_dmp, 'rt') as f1:
        for line in f1:
            line = line.split('\t')

            taxid = line[0]
            parent = line[2]
            rank = line[4]

            taxid2parent[taxid] = parent
            taxid2rank[taxid] = rank

    return (taxid2parent, taxid2rank)
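
Several examples call tax.find_lineage(taxid, taxid2parent) without showing it. The following is not the CAT implementation, only a minimal sketch of how a lineage walk over the taxid2parent mapping from import_nodes might look, combined with import_names (Example #9) to print readable names; the file paths and the example taxid 9606 (Homo sapiens) are illustrative:

# Hypothetical sketch of a lineage walk (not the actual tax.find_lineage):
# follow parent pointers from a taxid up to the root, which in the NCBI
# taxonomy dumps is the node that is its own parent (taxid '1').
def sketch_find_lineage(taxid, taxid2parent):
    lineage = [taxid]

    while taxid2parent[taxid] != taxid:
        taxid = taxid2parent[taxid]
        lineage.append(taxid)

    return lineage


(taxid2parent, taxid2rank) = import_nodes('taxonomy/nodes.dmp', None, False)
taxid2name = import_names('taxonomy/names.dmp', None, False)

lineage = sketch_find_lineage('9606', taxid2parent)

# Print the lineage root-first, as the other examples do with lineage[::-1].
print(';'.join(taxid2name[taxid] for taxid in lineage[::-1]))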
Example #11
def make_fastaid2LCAtaxid_file(taxonomy_folder, fastaid2LCAtaxid_file, nr_file,
                               prot_accession2taxid_file, log_file, quiet):
    prot_accession2taxid = import_prot_accession2taxid(
        prot_accession2taxid_file, log_file, quiet)
    nodes_dmp = '{0}/nodes.dmp'.format(taxonomy_folder)
    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)

    message = ('Finding LCA of all protein accession numbers in fasta headers '
               'of {0}. Please be patient...'.format(nr_file))
    shared.give_user_feedback(message, log_file, quiet)

    corrected = 0
    total = 0
    with gzip.open(nr_file,
                   'rt') as f1, shared.open_maybe_gzip(fastaid2LCAtaxid_file,
                                                       'wt') as outf1:
        for line in f1:
            if not line.startswith('>'):
                continue

            line = line.lstrip('>').split('\x01')

            accession_numbers = [i.split(' ')[0] for i in line]
            fastaid = accession_numbers[0]

            list_of_lineages = []
            for accession_number in accession_numbers:
                try:
                    taxid = prot_accession2taxid[accession_number]
                    lineage = tax.find_lineage(taxid, taxid2parent)
                    list_of_lineages.append(lineage)
                except KeyError:
                    # This accounts for missing accession numbers in
                    # prot.accession2taxid and missing nodes in nodes.dmp.
                    continue

            total += 1

            if len(list_of_lineages) == 0:
                # This accounts for entries that only contain accession numbers
                # that are missing in prot.accession2taxid or whose taxid is
                # missing in nodes.dmp. Note that these entries are thus not
                # present in the output file.
                continue

            LCAtaxid = tax.find_LCA(list_of_lineages)

            outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))

            try:
                if LCAtaxid != prot_accession2taxid[fastaid]:
                    corrected += 1
            except KeyError:
                # If the fastaid cannot be found in prot.accession2taxid, but
                # a taxid is given to the fastaid based on secondary accession
                # numbers, it is counted as a correction as well.
                corrected += 1

    message = ('Done! File {0} is created. '
               '{1} of {2} headers ({3:.1f}%) corrected. Please wait '
               'patiently for Python to collect garbage.'
               ''.format(fastaid2LCAtaxid_file, corrected, total,
                         corrected / total * 100))
    shared.give_user_feedback(message, log_file, quiet)
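
A hedged sketch of how this builder might be invoked (all paths are placeholders; note that the function opens the nr fasta with gzip.open, so a gzip-compressed nr file is assumed):

# Hypothetical database-construction call with placeholder paths.
make_fastaid2LCAtaxid_file('taxonomy/', 'db/nr.fastaid2LCAtaxid', 'db/nr.gz',
                           'taxonomy/prot.accession2taxid.gz', None, False)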
Example #12
def contigs(args):
    step_list = []

    (contigs_fasta, database_folder, taxonomy_folder, r, one_minus_r, f,
     out_prefix, predicted_proteins_fasta, diamond_file, path_to_prodigal,
     path_to_diamond, no_stars, compress, force, quiet, no_log, nproc,
     sensitive, block_size, index_chunks, tmpdir,
     top) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        # Check out_prefix up front, as the log file needs to be written to a
        # valid location.
        error = check.check_out_prefix(out_prefix, None, quiet)
        if error:
            sys.exit(1)

        log_file = '{0}.log'.format(out_prefix)
        with open(log_file, 'w') as outf1:
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check at which step to start.
    if predicted_proteins_fasta is None and diamond_file is None:
        message = ('\n'
                   'CAT is running. Protein prediction, alignment, and contig '
                   'classification are carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta, taxonomy_folder,
                                                database_folder, args.r,
                                                args.f, log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

        step_list.append('run_prodigal')
        step_list.append('run_diamond')
    elif (predicted_proteins_fasta is not None and diamond_file is None):
        message = ('\n'
                   'CAT is running. Since a predicted protein fasta is '
                   'supplied, only alignment and contig classification are '
                   'carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta, taxonomy_folder,
                                                database_folder, args.r,
                                                args.f, log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

        step_list.append('run_diamond')
    elif (predicted_proteins_fasta is not None and diamond_file is not None):
        message = ('\n'
                   'CAT is running. Since a predicted protein fasta and '
                   'DIAMOND alignment file are supplied, only contig '
                   'classification is carried out.\n'
                   'Rarw!\n\n'
                   'Supplied command: {0}\n\n'
                   'Contigs fasta: {1}\n'
                   'Taxonomy folder: {2}/\n'
                   'Database folder: {3}/\n'
                   'Parameter r: {4}\n'
                   'Parameter f: {5}\n'
                   'Log file: {6}\n\n'
                   '-----------------\n'.format(' '.join(sys.argv),
                                                contigs_fasta, taxonomy_folder,
                                                database_folder, args.r,
                                                args.f, log_file))
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
    elif (predicted_proteins_fasta is None and diamond_file is not None):
        message = ('ERROR: if you want CAT to directly do the classification, '
                   'you should not only supply a DIAMOND alignment table but '
                   'also a predicted protein fasta file with argument '
                   '[-p / --proteins].')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set parameters.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_out_prefix(out_prefix, log_file, quiet))

    if 'run_prodigal' in step_list:
        errors.append(
            check.check_prodigal_binaries(path_to_prodigal, log_file, quiet))

        predicted_proteins_fasta = ('{0}.predicted_proteins.faa'
                                    ''.format(out_prefix))
        predicted_proteins_gff = ('{0}.predicted_proteins.gff'
                                  ''.format(out_prefix))

        if not force:
            errors.append(
                check.check_output_file(predicted_proteins_fasta, log_file,
                                        quiet))
            errors.append(
                check.check_output_file(predicted_proteins_gff, log_file,
                                        quiet))

    compress_suffix = '.gz' if compress else ''

    if 'run_diamond' in step_list:
        errors.append(
            check.check_diamond_binaries(path_to_diamond, log_file, quiet))

        diamond_file = '{0}.alignment.diamond{1}'.format(
            out_prefix, compress_suffix)

        if not force:
            errors.append(
                check.check_output_file(diamond_file, log_file, quiet))
    else:
        # A DIAMOND alignment file was supplied by the user; use it as-is.
        pass

    errors.append(
        check.check_folders_for_run(taxonomy_folder, database_folder,
                                    step_list, log_file, quiet))

    contig2classification_output_file = ('{0}.contig2classification.txt{1}'
                                         ''.format(out_prefix,
                                                   compress_suffix))
    ORF2LCA_output_file = '{0}.ORF2LCA.txt{1}'.format(out_prefix,
                                                      compress_suffix)

    if not force:
        errors.append(
            check.check_output_file(contig2classification_output_file,
                                    log_file, quiet))
        errors.append(
            check.check_output_file(ORF2LCA_output_file, log_file, quiet))

    if 'run_prodigal' not in step_list:
        if not check.check_whether_file_is_fasta(predicted_proteins_fasta):
            message = ('ERROR: {0} is not a fasta file.'
                       ''.format(predicted_proteins_fasta))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            errors.append(True)

    errors.append(check.check_top(top, r, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp, names_dmp, prot_accession2taxid_file
     ) = check.inspect_taxonomy_folder(taxonomy_folder)
    (nr_file, diamond_database, fastaid2LCAtaxid_file,
     taxids_with_multiple_offspring_file
     ) = check.inspect_database_folder(database_folder)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Start CAT.
    contig_names = shared.import_contig_names(contigs_fasta, log_file, quiet)

    if 'run_prodigal' in step_list:
        shared.run_prodigal(path_to_prodigal, contigs_fasta,
                            predicted_proteins_fasta, predicted_proteins_gff,
                            tmpdir, log_file, quiet)

    contig2ORFs = shared.import_ORFs(predicted_proteins_fasta, log_file, quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                                  log_file, quiet)

    if 'run_diamond' in step_list:
        shared.run_diamond(path_to_diamond, diamond_database,
                           predicted_proteins_fasta, diamond_file, nproc,
                           sensitive, block_size, index_chunks, tmpdir, top,
                           log_file, compress, quiet)

    (ORF2hits, all_hits) = shared.parse_diamond_file(diamond_file, one_minus_r,
                                                     log_file, quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(fastaid2LCAtaxid_file,
                                                   all_hits, log_file, quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        taxids_with_multiple_offspring_file, log_file, quiet)

    message = ('CAT is spinning! Files {0} and {1} are created.'
               ''.format(contig2classification_output_file,
                         ORF2LCA_output_file))
    shared.give_user_feedback(message, log_file, quiet)

    number_of_classified_contigs = 0

    with shared.open_maybe_gzip(contig2classification_output_file,
                                'wt') as outf1, shared.open_maybe_gzip(
                                    ORF2LCA_output_file, 'wt') as outf2:
        outf1.write('# contig\tclassification\treason\tlineage\t'
                    'lineage scores\n')
        outf2.write('# ORF\tlineage\tbit-score\n')

        for contig in sorted(contig_names):
            if contig not in contig2ORFs:
                outf1.write('{0}\tunclassified\tno ORFs found\n'
                            ''.format(contig))

                continue

            LCAs_ORFs = []

            for ORF in contig2ORFs[contig]:
                if ORF not in ORF2hits:
                    outf2.write('{0}\tORF has no hit to database\n'
                                ''.format(ORF))

                    continue

                (taxid,
                 top_bitscore) = tax.find_LCA_for_ORF(ORF2hits[ORF],
                                                      fastaid2LCAtaxid,
                                                      taxid2parent)

                if taxid.startswith('no taxid found'):
                    outf2.write('{0}\t{1}\t{2}\n'.format(
                        ORF, taxid, top_bitscore))
                else:
                    lineage = tax.find_lineage(taxid, taxid2parent)

                    if not no_stars:
                        lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)

                    outf2.write('{0}\t{1}\t{2}\n'
                                ''.format(ORF, ';'.join(lineage[::-1]),
                                          top_bitscore))

                LCAs_ORFs.append((taxid, top_bitscore))

            if len(LCAs_ORFs) == 0:
                outf1.write('{0}\tunclassified\tno hits to database\n'
                            ''.format(contig))

                continue

            (lineages, lineages_scores,
             based_on_number_of_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, f)

            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tunclassified\t'
                            'hits not found in taxonomy files\n'
                            ''.format(contig))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write('{0}\tunclassified\t'
                            'no lineage reached minimum bit-score support\n'
                            ''.format(contig))

                continue

            # The contig has a valid classification.
            number_of_classified_contigs += 1

            for (i, lineage) in enumerate(lineages):
                if not no_stars:
                    lineage = tax.star_lineage(lineage,
                                               taxids_with_multiple_offspring)
                scores = [
                    '{0:.2f}'.format(score) for score in lineages_scores[i]
                ]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write('{0}\tclassified\t'
                                'based on {1}/{2} ORFs\t{3}\t{4}\n'
                                ''.format(contig, based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write('{0}\tclassified ({1}/{2})\t'
                                'based on {3}/{4} ORFs\t{5}\t{6}\n'
                                ''.format(contig, i + 1, len(lineages),
                                          based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))

    message = ('\n-----------------\n\n'
               '[{0}] CAT is done! {1}/{2} contigs classified.'
               ''.format(datetime.datetime.now(), number_of_classified_contigs,
                         len(contig_names)))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if f < 0.5:
        message = ('\nWARNING: since f is set to smaller than 0.5, one '
                   'contig may have multiple classifications.')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
Example #13
def summarise_bins(input_file, output_file, force, quiet):
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_input_file(input_file, log_file, quiet))

    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.split('\t')

                if line[0] != '# bin':
                    message = ('ERROR: {0} is not a BAT classification file.'
                               ''.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    if line[0] == '# contig':
                        message = ('ERROR: {0} appears to be a CAT '
                                   'classification file. If you want to '
                                   'summarise contig classifications, please '
                                   'supply a contigs fasta.'
                                   ''.format(input_file))
                        shared.give_user_feedback(message,
                                                  log_file,
                                                  quiet,
                                                  error=True)

                    sys.exit(1)

                try:
                    superkingdom_index = line.index('superkingdom')
                except ValueError:
                    message = ('ERROR: official ranks not found in header of '
                               '{0}. Make sure that official names were added '
                               'to the BAT classification file with \'CAT '
                               'add_names --only_official\'.'
                               ''.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    sys.exit(1)

                break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    number_of_bins = {}
    number_of_bins['unclassified'] = 0

    official_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    for rank in official_ranks:
        number_of_bins[rank] = {}

    n = 0
    bin_trace = set()
    doubles = set()
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            bin_ = line[0]

            if bin_ in bin_trace:
                doubles.add(bin_)

            bin_trace.add(bin_)

            if line[1] == 'unclassified':
                number_of_bins['unclassified'] += 1

                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                rank = official_ranks[i]

                if classification not in number_of_bins[rank]:
                    number_of_bins[rank][classification] = 0

                number_of_bins[rank][classification] += 1

    if len(doubles) != 0:
        message = ('ERROR: some bins have multiple classifications. CAT '
                   'summarise currently does not allow for this. Bins with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(list(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    number_of_classified_bins = n - number_of_bins['unclassified']

    with shared.open_maybe_gzip(output_file, 'wt') as outf1:
        outf1.write('# total number of bins is {0}, of which {1} ({2:.2f}%) '
                    'are classified.\n'
                    ''.format(n, number_of_classified_bins,
                              number_of_classified_bins / n * 100))
        outf1.write('#\n')
        outf1.write('# rank\tclade\tnumber of bins\n')

        for rank in official_ranks:
            for clade in sorted(number_of_bins[rank],
                                key=lambda x: number_of_bins[rank][x],
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\n'
                            ''.format(rank, clade,
                                      number_of_bins[rank][clade]))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
Example #14
def summarise_contigs(input_file, output_file, contigs_fasta, force, quiet):
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_input_file(input_file, log_file, quiet))

    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    contig2length = import_contig_lengths(contigs_fasta, log_file, quiet)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.split('\t')

                if line[0] != '# contig':
                    message = ('ERROR: {0} is not a CAT classification file.'
                               ''.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    if line[0] == '# bin':
                        message = ('ERROR: {0} appears to be a BAT '
                                   'classification file. If you want to '
                                   'summarise bin classifications, just '
                                   'don\'t supply a contigs fasta and '
                                   'everything should be fine!'
                                   ''.format(input_file))
                        shared.give_user_feedback(message,
                                                  log_file,
                                                  quiet,
                                                  error=True)

                    sys.exit(1)

                try:
                    superkingdom_index = line.index('superkingdom')
                except ValueError:
                    message = ('ERROR: official ranks not found in header of '
                               '{0}. Make sure that official names were added '
                               'to the CAT classification file with \'CAT '
                               'add_names --only_official\'.'
                               ''.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    sys.exit(1)

                break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    length = {}
    length['unclassified'] = []

    ORFs = {}

    official_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    for rank in official_ranks:
        length[rank] = {}
        ORFs[rank] = {}

    n = 0
    contig_trace = set()
    doubles = set()
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            contig = line[0]

            if contig in contig_trace:
                doubles.add(contig)

            contig_trace.add(contig)

            if contig not in contig2length:
                message = ('ERROR: contig {0} in CAT classification file is '
                           'not found in supplied contigs fasta file. Are you '
                           'sure the CAT classification file is based on the '
                           'contigs fasta?'.format(contig))
                shared.give_user_feedback(message, log_file, quiet, error=True)

                sys.exit(1)

            if line[1] == 'unclassified':
                length['unclassified'].append(contig2length[contig])

                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                rank = official_ranks[i]

                if classification not in length[rank]:
                    length[rank][classification] = []

                    ORFs[rank][classification] = []

                length[rank][classification].append(contig2length[contig])

                # Note that the total number of ORFs on a contig is reported,
                # not only the number of ORFs the classification is based on.
                ORFs_on_contig = int(line[2].split('/')[1].split(' ')[0])
                ORFs[rank][classification].append(ORFs_on_contig)

    if len(doubles) != 0:
        message = ('ERROR: some contigs have multiple classifications. CAT '
                   'summarise currently does not allow for this. Contigs with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(list(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if n != len(contig2length):
        message = ('ERROR: the number of classified contigs is not the same '
                   'as the number of contigs in contigs fasta. Are you sure '
                   'the CAT classification file is based on the contigs '
                   'fasta?')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    with shared.open_maybe_gzip(output_file, 'wt') as outf1:
        number_of_contigs = len(contig2length)
        total_length = sum(contig2length.values())
        number_of_classified_contigs = number_of_contigs - len(
            length['unclassified'])
        total_classified_length = total_length - sum(length['unclassified'])

        outf1.write('# total number of contigs in {0} is {1} representing {2} '
                    'positions.\n'
                    ''.format(contigs_fasta, number_of_contigs, total_length))
        outf1.write('# {0} contigs are classified ({1:.2f}%) representing {2} '
                    'positions ({3:.2f}%) in {4}.\n'
                    ''.format(
                        number_of_classified_contigs,
                        number_of_classified_contigs / number_of_contigs * 100,
                        total_classified_length,
                        total_classified_length / total_length * 100,
                        input_file))
        outf1.write('#\n')
        outf1.write('# rank\t'
                    'clade\t'
                    'number of contigs\t'
                    'number of ORFs\t'
                    'number of positions\n')

        for rank in official_ranks:
            for clade in sorted(length[rank],
                                key=lambda x: sum(length[rank][x]),
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\t{3}\t{4}\n'
                            ''.format(rank, clade, len(length[rank][clade]),
                                      sum(ORFs[rank][clade]),
                                      sum(length[rank][clade])))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
Example #15
def add_names(args):
    (input_file, output_file, taxonomy_folder, only_official, exclude_scores,
     force, quiet) = check.convert_arguments(args)

    # Currently add_names does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_input_file(input_file, log_file, quiet))

    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp, names_dmp, prot_accession2taxid_file
     ) = check.inspect_taxonomy_folder(taxonomy_folder)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    taxid2name = tax.import_names(names_dmp, log_file, quiet)

    message = 'Appending names...'
    shared.give_user_feedback(message, log_file, quiet)

    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if line.startswith('#'):
                line = line.rstrip().split('\t')

                try:
                    lineage_index = line.index('lineage')
                except ValueError:
                    message = ('ERROR: {0} is not a supported classification '
                               'file.'.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    sys.exit(1)

                try:
                    scores_index = line.index('lineage scores')
                except ValueError:
                    scores_index = None

                full_length = len(line)

                break
        else:
            message = ('ERROR: {0} is not a supported classification file.'
                       ''.format(input_file))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    with shared.open_maybe_gzip(input_file,
                                'rt') as f1, shared.open_maybe_gzip(
                                    output_file, 'wt') as outf1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                if only_official:
                    outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t'
                                'family\tgenus\tspecies\n'.format(line))
                else:
                    outf1.write('{0}\tfull lineage names\n'.format(line))

                continue

            line = line.split('\t')

            if len(line) != full_length:
                # Entry does not have a full annotation.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            if (line[1].startswith('no taxid found')
                    or line[2].startswith('no taxid found')):
                # ORF has database hits but the accession number is not found
                # in the taxonomy files.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            lineage = line[lineage_index].split(';')

            if scores_index is not None and not exclude_scores:
                scores = line[scores_index].split(';')
            else:
                scores = None

            if only_official:
                names = tax.convert_to_official_names(lineage, taxid2rank,
                                                      taxid2name, scores)
            else:
                names = tax.convert_to_names(lineage, taxid2rank, taxid2name,
                                             scores)

            outf1.write('{0}\t{1}\n'.format('\t'.join(line), '\t'.join(names)))

    message = 'Names written to {0}!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)