Ejemplo n.º 1
0
def single_bin(args):
    """Run BAT on a single bin: predict proteins, align, and classify.

    Which stages run depends on the supplied inputs. Without a predicted
    protein fasta and an alignment file, both Prodigal and DIAMOND are run.
    With only a predicted protein fasta, DIAMOND is run. With both, only the
    classification is carried out. Supplying an alignment file without a
    predicted protein fasta is reported as an error.

    Two tab-separated files are written: '<out_prefix>.ORF2LCA.txt' with one
    line per ORF, and '<out_prefix>.bin2classification.txt' with the
    classification(s) of the bin. Unless logging is disabled,
    '<out_prefix>.log' is created (and emptied if it already exists).

    Args:
        args: the parsed arguments. They are unpacked with
            check.convert_arguments(); args.r and args.f are additionally
            echoed in the start-up message.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 if the output prefix is unusable for the
            log file, if an alignment file is supplied without a predicted
            protein fasta, or if any pre-flight check reports an error.
    """
    (bin_fasta, database_folder, taxonomy_folder, r, one_minus_r, f,
     out_prefix, predicted_proteins_fasta, diamond_file, path_to_prodigal,
     path_to_diamond, force, quiet, no_log, nproc, sensitive, block_size,
     index_chunks, tmpdir, top) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        # Check out_prefix already as the log file needs to be written to a
        # valid location.
        if check.check_out_prefix(out_prefix, None, quiet):
            sys.exit(1)

        log_file = '{0}.log'.format(out_prefix)
        # Create the log file, or empty the one left by an earlier run.
        with open(log_file, 'w'):
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check at which state to start.
    proteins_supplied = predicted_proteins_fasta is not None
    alignment_supplied = diamond_file is not None

    if alignment_supplied and not proteins_supplied:
        message = ('ERROR: if you want BAT to directly classify a single bin, '
                   'you should not only supply a DIAMOND alignment table but '
                   'also a predicted protein fasta file with argument '
                   '[-p / --proteins].')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if not proteins_supplied:
        step_list = ['run_prodigal', 'run_diamond']
        intro = ('BAT is running. Protein prediction, alignment, and bin '
                 'classification are carried out.')
    elif not alignment_supplied:
        step_list = ['run_diamond']
        intro = ('BAT is running. Since a predicted protein fasta is '
                 'supplied, only alignment and bin classification are '
                 'carried out.')
    else:
        step_list = []
        intro = ('BAT is running. Since a predicted protein fasta and '
                 'DIAMOND alignment file are supplied, only bin '
                 'classification is carried out.')

    # The banner is the same for every starting state; only the first
    # sentence differs. The bin fasta is a file, so it is printed without a
    # trailing slash (the folders keep theirs).
    banner = ('Rarw!\n\n'
              'Supplied command: {0}\n\n'
              'Bin fasta: {1}\n'
              'Taxonomy folder: {2}/\n'
              'Database folder: {3}/\n'
              'Parameter r: {4}\n'
              'Parameter f: {5}\n'
              'Log file: {6}\n\n'
              '-----------------\n').format(' '.join(sys.argv), bin_fasta,
                                            taxonomy_folder, database_folder,
                                            args.r, args.f, log_file)
    message = '\n{0}\n{1}'.format(intro, banner)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set parameters.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Every check result is collected first, so that all problems are
    # reported before the run is aborted.
    errors = []

    errors.append(check.check_out_prefix(out_prefix, log_file, quiet))

    errors.append(check.check_bin_fasta(bin_fasta, log_file, quiet))

    if 'run_prodigal' in step_list:
        errors.append(
            check.check_prodigal_binaries(path_to_prodigal, log_file, quiet))

        predicted_proteins_fasta = ('{0}.predicted_proteins.faa'
                                    ''.format(out_prefix))
        predicted_proteins_gff = ('{0}.predicted_proteins.gff'
                                  ''.format(out_prefix))

        if not force:
            errors.append(
                check.check_output_file(predicted_proteins_fasta, log_file,
                                        quiet))
            errors.append(
                check.check_output_file(predicted_proteins_gff, log_file,
                                        quiet))

    if 'run_diamond' in step_list:
        errors.append(
            check.check_diamond_binaries(path_to_diamond, log_file, quiet))

        # When DIAMOND is not run, the supplied alignment file is used as is.
        diamond_file = '{0}.alignment.diamond'.format(out_prefix)

        if not force:
            errors.append(
                check.check_output_file(diamond_file, log_file, quiet))

    errors.append(
        check.check_folders_for_run(taxonomy_folder, database_folder,
                                    step_list, log_file, quiet))

    bin2classification_output_file = ('{0}.bin2classification.txt'
                                      ''.format(out_prefix))
    ORF2LCA_output_file = '{0}.ORF2LCA.txt'.format(out_prefix)

    if not force:
        errors.append(
            check.check_output_file(bin2classification_output_file, log_file,
                                    quiet))
        errors.append(
            check.check_output_file(ORF2LCA_output_file, log_file, quiet))

    if 'run_prodigal' not in step_list:
        # The protein fasta was supplied by the user, so verify its format.
        if not check.check_whether_file_is_fasta(predicted_proteins_fasta):
            message = ('ERROR: {0} is not a fasta file.'
                       ''.format(predicted_proteins_fasta))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            errors.append(True)

    errors.append(check.check_top(top, r, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp, names_dmp, prot_accession2taxid_file
     ) = check.inspect_taxonomy_folder(taxonomy_folder)
    (nr_file, diamond_database, fastaid2LCAtaxid_file,
     taxids_with_multiple_offspring_file
     ) = check.inspect_database_folder(database_folder)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Start BAT.
    contig_names = shared.import_contig_names(bin_fasta, log_file, quiet)

    if 'run_prodigal' in step_list:
        shared.run_prodigal(path_to_prodigal, bin_fasta,
                            predicted_proteins_fasta, predicted_proteins_gff,
                            log_file, quiet)

    contig2ORFs = shared.import_ORFs(predicted_proteins_fasta, log_file, quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                                  log_file, quiet)

    if 'run_diamond' in step_list:
        shared.run_diamond(path_to_diamond, diamond_database,
                           predicted_proteins_fasta, diamond_file, nproc,
                           sensitive, block_size, index_chunks, tmpdir, top,
                           log_file, quiet)

    (ORF2hits, all_hits) = shared.parse_diamond_file(diamond_file, one_minus_r,
                                                     log_file, quiet)

    # Only the parent mapping is needed here; the rank mapping is discarded.
    (taxid2parent, _) = tax.import_nodes(nodes_dmp, log_file, quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(fastaid2LCAtaxid_file,
                                                   all_hits, log_file, quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        taxids_with_multiple_offspring_file, log_file, quiet)

    message = ('BAT is flying! Files {0} and {1} are created.'
               ''.format(bin2classification_output_file, ORF2LCA_output_file))
    shared.give_user_feedback(message, log_file, quiet)

    number_of_classified_bins = 0

    with open(bin2classification_output_file,
              'w') as outf1, open(ORF2LCA_output_file, 'w') as outf2:
        outf1.write('# bin\tclassification\tnumber of ORFs in bin\t'
                    'number of ORFs classification is based on\tlineage\t'
                    'lineage scores\n')
        outf2.write('# ORF\tbin\tlineage\tbit-score\n')

        # The list contains only a single bin, but I keep the code like this
        # to make the code consistent across bin and bins.
        bin_list = [bin_fasta.rsplit('/', 1)[-1]]
        for bin_ in bin_list:
            LCAs_ORFs = []

            for contig in sorted(contig_names):
                if contig not in contig2ORFs:
                    continue

                for ORF in contig2ORFs[contig]:
                    if ORF not in ORF2hits:
                        outf2.write('{0}\t{1}\tORF has no hit to database.\n'
                                    ''.format(ORF, bin_))

                        continue

                    (taxid, top_bitscore) = tax.find_LCA_for_ORF(
                        ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)

                    if taxid.startswith('no taxid found'):
                        # No lineage can be built; report the message itself.
                        lineage_field = taxid
                    else:
                        lineage = tax.find_lineage(taxid, taxid2parent)
                        starred_lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)
                        # The lineage is written in reversed order.
                        lineage_field = ';'.join(starred_lineage[::-1])

                    outf2.write('{0}\t{1}\t{2}\t{3}\n'
                                ''.format(ORF, bin_, lineage_field,
                                          top_bitscore))

                    # ORFs without a taxid are passed on as well.
                    LCAs_ORFs.append((taxid, top_bitscore))

            if not LCAs_ORFs:
                outf1.write('{0}\tunclassified (no hits to database)\n'
                            ''.format(bin_))

                continue

            (lineages, lineages_scores,
             based_on_number_of_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, f)

            # tax.find_weighted_LCA signals failure with a message string in
            # place of the lineages.
            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tunclassified '
                            '(hits not found in taxonomy files)\n'
                            ''.format(bin_))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write('{0}\tunclassified '
                            '(no lineage reached minimum bit-score support)\n'
                            ''.format(bin_))

                continue

            # The bin has a valid classification.
            number_of_classified_bins += 1

            total_number_of_ORFs = sum(
                len(contig2ORFs[contig]) for contig in contig_names
                if contig in contig2ORFs)

            for (i, lineage) in enumerate(lineages):
                starred_lineage = tax.star_lineage(
                    lineage, taxids_with_multiple_offspring)

                scores = [
                    '{0:.2f}'.format(score) for score in lineages_scores[i]
                ]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write('{0}\tclassified\t{1}\t{2}\t{3}\t{4}\n'
                                ''.format(bin_, total_number_of_ORFs,
                                          based_on_number_of_ORFs,
                                          ';'.join(starred_lineage[::-1]),
                                          ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write('{0}\tclassified ({1}/{2})'
                                '\t{3}\t{4}\t{5}\t{6}\n'
                                ''.format(bin_, i + 1, len(lineages),
                                          total_number_of_ORFs,
                                          based_on_number_of_ORFs,
                                          ';'.join(starred_lineage[::-1]),
                                          ';'.join(scores[::-1])))

    message = ('\n-----------------\n'
               '[{0}] BAT is done! {1}/1 bin classified.'
               ''.format(datetime.datetime.now(), number_of_classified_bins))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if f < 0.5:
        message = ('WARNING: since f is set to smaller than 0.5, one bin '
                   'may have multiple classifications.')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
Ejemplo n.º 2
0
def run():
    """Classify a single bin with BAT, starting from the command line.

    The arguments come from parse_arguments(). Depending on whether a
    predicted protein fasta and/or an alignment file were supplied, protein
    prediction and alignment are run or skipped before the bin is classified.
    Supplying an alignment file without a predicted protein fasta is reported
    as an error and ends the run with exit status 1, as does any failing
    pre-flight check.

    Two tab-separated files are written: '<out_prefix>.ORF2LCA.txt' with one
    line per ORF, and '<out_prefix>.bin2classification.txt' with the
    classification(s) of the bin.
    """
    args = parse_arguments()

    def report(text, **options):
        # All feedback of this run goes to the same log file with the same
        # quiet flag; both are looked up on args at the moment of the call.
        shared.give_user_feedback(text, args.log_file, args.quiet, **options)

    report('# CAT v{0}.'.format(about.__version__), show_time=False)

    # Check at which state to start.
    proteins_given = bool(args.proteins_fasta)
    alignment_given = bool(args.alignment_file)

    if alignment_given and not proteins_given:
        # An alignment cannot be interpreted without the proteins it is
        # based on.
        report(
            'if you want BAT to directly classify a single bin, you '
            'should not only supply an alignment table but also a '
            'predicted protein fasta file with argument '
            '[-p / --proteins].',
            error=True)

        sys.exit(1)

    if not proteins_given:
        step_list = ['predict_proteins', 'align']
        announcement = (
            '\n'
            'BAT is running. Protein prediction, alignment, and bin '
            'classification are carried out.')
    elif not alignment_given:
        step_list = ['align']
        announcement = (
            '\n'
            'BAT is running. Since a predicted protein fasta is supplied, '
            'only alignment and bin classification are carried out.')
    else:
        step_list = []
        announcement = (
            '\n'
            'BAT is running. Since a predicted protein fasta and '
            'alignment file are supplied, only bin classification is '
            'carried out.')

    report(announcement, show_time=False)

    step_list.append('classify')

    # Print variables.
    banner = ('Rarw!\n\n'
              'Supplied command: {0}\n\n'
              'Bin fasta: {1}\n'
              'Taxonomy folder: {2}\n'
              'Database folder: {3}\n'
              'Parameter r: {4}\n'
              'Parameter f: {5}\n'
              'Log file: {6}\n\n'
              '-----------------\n')
    report(banner.format(' '.join(sys.argv), args.bin_fasta,
                         args.taxonomy_folder, args.database_folder,
                         int(args.r), float(args.f), args.log_file),
           show_time=False)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set variables.
    report('Doing some pre-flight checks first.', show_time=False)

    # The results of all checks are gathered before deciding to stop, so
    # every problem gets reported in one go.
    errors = [
        check.check_out_prefix(args.out_prefix, args.log_file, args.quiet),
        check.check_bin_fasta(args.bin_fasta, args.log_file, args.quiet),
    ]

    predicting = 'predict_proteins' in step_list
    aligning = 'align' in step_list

    if predicting:
        errors.append(
            check.check_prodigal_binaries(args.path_to_prodigal, args.log_file,
                                          args.quiet))

        # The predicted proteins are written next to the other output files.
        args.proteins_fasta = '{0}.predicted_proteins.faa'.format(
            args.out_prefix)
        args.proteins_gff = '{0}.predicted_proteins.gff'.format(
            args.out_prefix)

        if not args.force:
            for planned_file in (args.proteins_fasta, args.proteins_gff):
                errors.append(
                    check.check_output_file(planned_file, args.log_file,
                                            args.quiet))

    if aligning:
        errors.append(
            check.check_diamond_binaries(args.path_to_diamond, args.log_file,
                                         args.quiet))

        args.alignment_file = '{0}.alignment.diamond'.format(args.out_prefix)

        if not args.force:
            errors.append(
                check.check_output_file(args.alignment_file, args.log_file,
                                        args.quiet))

    errors.append(
        check.check_folders_for_run(args.taxonomy_folder, args.nodes_dmp,
                                    args.names_dmp, args.database_folder,
                                    args.diamond_database,
                                    args.fastaid2LCAtaxid_file,
                                    args.taxids_with_multiple_offspring_file,
                                    step_list, args.log_file, args.quiet))

    args.bin2classification_output_file = '{0}.bin2classification.txt'.format(
        args.out_prefix)
    args.ORF2LCA_output_file = '{0}.ORF2LCA.txt'.format(args.out_prefix)

    if not args.force:
        for planned_file in (args.bin2classification_output_file,
                             args.ORF2LCA_output_file):
            errors.append(
                check.check_output_file(planned_file, args.log_file,
                                        args.quiet))

    if not predicting:
        # The protein fasta was supplied, so it still has to be checked.
        errors.append(
            check.check_fasta(args.proteins_fasta, args.log_file, args.quiet))

    if aligning:
        errors.append(
            check.check_top(args.top, args.r, args.log_file, args.quiet))

    # Print all variables.
    shared.print_variables(args, step_list)

    if True in errors:
        sys.exit(1)

    report('Ready to fly!\n\n-----------------\n', show_time=False)

    # Start BAT.
    contig_names = shared.import_contig_names(args.bin_fasta, args.log_file,
                                              args.quiet)

    if predicting:
        shared.run_prodigal(args.path_to_prodigal, args.bin_fasta,
                            args.proteins_fasta, args.proteins_gff,
                            args.log_file, args.quiet)

    contig2ORFs = shared.import_ORFs(args.proteins_fasta, args.log_file,
                                     args.quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                                  args.log_file, args.quiet)

    if aligning:
        shared.run_diamond(args)

    ORF2hits, all_hits = shared.parse_tabular_alignment(
        args.alignment_file, args.one_minus_r, args.log_file, args.quiet)

    # The rank mapping is returned as well but not used in this function.
    taxid2parent, _taxid2rank = tax.import_nodes(args.nodes_dmp,
                                                 args.log_file, args.quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(
        args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        args.taxids_with_multiple_offspring_file, args.log_file, args.quiet)

    report('BAT is flying! Files {0} and {1} are created.'.format(
        args.bin2classification_output_file, args.ORF2LCA_output_file))

    n_classified_bins = 0

    with open(args.bin2classification_output_file,
              'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
        outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n')

        outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')

        # A single bin is wrapped in a list so that this loop reads the same
        # as the one used for multiple bins.
        for bin_ in [args.bin_fasta.rsplit('/', 1)[-1]]:
            LCAs_ORFs = []

            for contig in sorted(contig_names):
                if contig not in contig2ORFs:
                    continue

                for ORF in contig2ORFs[contig]:
                    if ORF not in ORF2hits:
                        outf2.write('{0}\t{1}\tORF has no hit to database\n'
                                    ''.format(ORF, bin_))

                        continue

                    hits = ORF2hits[ORF]
                    n_hits = len(hits)

                    taxid, top_bitscore = tax.find_LCA_for_ORF(
                        hits, fastaid2LCAtaxid, taxid2parent)

                    if taxid.startswith('no taxid found'):
                        # Without a taxid there is no lineage; the message
                        # takes its place in the output.
                        lineage_field = taxid
                    else:
                        lineage = tax.find_lineage(taxid, taxid2parent)

                        if not args.no_stars:
                            lineage = tax.star_lineage(
                                lineage, taxids_with_multiple_offspring)

                        # The lineage is written in reversed order.
                        lineage_field = ';'.join(lineage[::-1])

                    outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                        ORF, bin_, n_hits, lineage_field, top_bitscore))

                    LCAs_ORFs.append((taxid, top_bitscore))

            if not LCAs_ORFs:
                outf1.write('{0}\tno taxid assigned\tno hits to database\n'
                            ''.format(bin_))

                continue

            lineages, lineages_scores, based_on_n_ORFs = tax.find_weighted_LCA(
                LCAs_ORFs, taxid2parent, args.f)

            # tax.find_weighted_LCA signals failure with a message string in
            # place of the lineages.
            failure_reason = None
            if lineages == 'no ORFs with taxids found.':
                failure_reason = 'hits not found in taxonomy files'
            elif lineages == 'no lineage whitelisted.':
                failure_reason = 'no lineage reached minimum bit-score support'

            if failure_reason is not None:
                outf1.write('{0}\tno taxid assigned\t{1}\n'.format(
                    bin_, failure_reason))

                continue

            # The bin has a valid classification.
            n_classified_bins += 1

            total_n_ORFs = sum(
                len(contig2ORFs[contig]) for contig in contig_names
                if contig in contig2ORFs)

            n_lineages = len(lineages)

            for i, lineage in enumerate(lineages):
                if not args.no_stars:
                    lineage = tax.star_lineage(lineage,
                                               taxids_with_multiple_offspring)

                scores = [format(score, '.2f') for score in lineages_scores[i]]

                if n_lineages == 1:
                    # There is only one classification.
                    verdict = 'taxid assigned'
                else:
                    # There are multiple classifications; number them.
                    verdict = 'taxid assigned ({0}/{1})'.format(
                        i + 1, n_lineages)

                outf1.write('{0}\t{1}\tbased on {2}/{3} ORFs\t{4}\t{5}\n'
                            ''.format(bin_, verdict, based_on_n_ORFs,
                                      total_n_ORFs, ';'.join(lineage[::-1]),
                                      ';'.join(scores[::-1])))

    report(
        '\n-----------------\n'
        '{0} BAT is done! {1}/1 bin has taxonomy assigned.'.format(
            shared.timestamp(), n_classified_bins),
        show_time=False)

    if args.f < 0.5:
        report(
            'WARNING: since f is set to smaller than 0.5, one bin '
            'may have multiple classifications.',
            show_time=False)