コード例 #1
0
ファイル: summarise.py プロジェクト: linsalrob/CAT
def summarise(args):
    """Summarise a classification file, dispatching on the supplied input.

    The arguments are unpacked via check.convert_arguments(). When no contigs
    fasta is supplied the work is delegated to summarise_bins(); otherwise it
    is delegated to summarise_contigs(), which additionally receives the
    contigs fasta.

    Args:
        args: Parsed command-line arguments, as accepted by
            check.convert_arguments().
    """
    (input_file, output_file, contigs_fasta, force,
     quiet) = check.convert_arguments(args)

    # None is a singleton, so test identity rather than equality.
    if contigs_fasta is None:
        summarise_bins(input_file, output_file, force, quiet)
    else:
        summarise_contigs(input_file, output_file, contigs_fasta, force, quiet)
コード例 #2
0
def contigs(args):
    """Classify contigs, running only the pipeline steps that are still needed.

    Depending on which intermediate files are supplied, shared.run_prodigal()
    and/or shared.run_diamond() are called first. The contigs are then
    classified and the results are written to
    '<out_prefix>.contig2classification.txt' and '<out_prefix>.ORF2LCA.txt'.

    Args:
        args: Parsed command-line arguments. They are unpacked via
            check.convert_arguments(); args.r and args.f are additionally
            echoed in the start-up banner.

    Exits with status 1 (via sys.exit) when the output prefix is invalid, when
    a DIAMOND alignment file is supplied without a predicted protein fasta, or
    when any of the pre-flight checks reports an error.
    """
    step_list = []

    (contigs_fasta, database_folder, taxonomy_folder, r, one_minus_r, f,
     out_prefix, predicted_proteins_fasta, diamond_file, path_to_prodigal,
     path_to_diamond, no_stars, force, quiet, no_log, nproc, sensitive,
     block_size, index_chunks, tmpdir, top) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        # Check out_prefix already as the log file needs to be written to a
        # valid location.
        error = check.check_out_prefix(out_prefix, None, quiet)
        if error:
            sys.exit(1)

        log_file = '{0}.log'.format(out_prefix)
        # Create the log file empty, truncating any existing one.
        with open(log_file, 'w'):
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check at which state to start.
    have_proteins = predicted_proteins_fasta is not None
    have_alignment = diamond_file is not None

    if have_alignment and not have_proteins:
        message = ('ERROR: if you want CAT to directly do the classification, '
                   'you should not only supply a DIAMOND alignment table but '
                   'also a predicted protein fasta file with argument '
                   '[-p / --proteins].')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    # Only the first sentence of the banner and the steps to run depend on
    # which intermediate files were supplied.
    if not have_proteins:
        intro = ('CAT is running. Protein prediction, alignment, and contig '
                 'classification are carried out.\n')
        steps_to_run = ['run_prodigal', 'run_diamond']
    elif not have_alignment:
        intro = ('CAT is running. Since a predicted protein fasta is '
                 'supplied, only alignment and contig classification are '
                 'carried out.\n')
        steps_to_run = ['run_diamond']
    else:
        intro = ('CAT is running. Since a predicted protein fasta and '
                 'DIAMOND alignment file are supplied, only contig '
                 'classification is carried out.\n')
        steps_to_run = []

    message = ('\n'
               '{0}'
               'Rarw!\n\n'
               'Supplied command: {1}\n\n'
               'Contigs fasta: {2}\n'
               'Taxonomy folder: {3}/\n'
               'Database folder: {4}/\n'
               'Parameter r: {5}\n'
               'Parameter f: {6}\n'
               'Log file: {7}\n\n'
               '-----------------\n'.format(intro, ' '.join(sys.argv),
                                            contigs_fasta, taxonomy_folder,
                                            database_folder, args.r,
                                            args.f, log_file))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    step_list.extend(steps_to_run)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set parameters.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Collect the outcome of every check so that all problems are reported
    # before exiting.
    errors = []

    errors.append(check.check_out_prefix(out_prefix, log_file, quiet))

    if 'run_prodigal' in step_list:
        errors.append(
            check.check_prodigal_binaries(path_to_prodigal, log_file, quiet))

        predicted_proteins_fasta = ('{0}.predicted_proteins.faa'
                                    ''.format(out_prefix))
        predicted_proteins_gff = ('{0}.predicted_proteins.gff'
                                  ''.format(out_prefix))

        if not force:
            errors.append(
                check.check_output_file(predicted_proteins_fasta, log_file,
                                        quiet))
            errors.append(
                check.check_output_file(predicted_proteins_gff, log_file,
                                        quiet))

    if 'run_diamond' in step_list:
        errors.append(
            check.check_diamond_binaries(path_to_diamond, log_file, quiet))

        diamond_file = '{0}.alignment.diamond'.format(out_prefix)

        if not force:
            errors.append(
                check.check_output_file(diamond_file, log_file, quiet))

    errors.append(
        check.check_folders_for_run(taxonomy_folder, database_folder,
                                    step_list, log_file, quiet))

    contig2classification_output_file = ('{0}.contig2classification.txt'
                                         ''.format(out_prefix))
    ORF2LCA_output_file = '{0}.ORF2LCA.txt'.format(out_prefix)

    if not force:
        errors.append(
            check.check_output_file(contig2classification_output_file,
                                    log_file, quiet))
        errors.append(
            check.check_output_file(ORF2LCA_output_file, log_file, quiet))

    if 'run_prodigal' not in step_list:
        # The protein fasta was supplied by the user, so validate it.
        if not check.check_whether_file_is_fasta(predicted_proteins_fasta):
            message = ('ERROR: {0} is not a fasta file.'
                       ''.format(predicted_proteins_fasta))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            errors.append(True)

    errors.append(check.check_top(top, r, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp, names_dmp, prot_accession2taxid_file
     ) = check.inspect_taxonomy_folder(taxonomy_folder)
    (nr_file, diamond_database, fastaid2LCAtaxid_file,
     taxids_with_multiple_offspring_file
     ) = check.inspect_database_folder(database_folder)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Start CAT.
    contig_names = shared.import_contig_names(contigs_fasta, log_file, quiet)

    if 'run_prodigal' in step_list:
        shared.run_prodigal(path_to_prodigal, contigs_fasta,
                            predicted_proteins_fasta, predicted_proteins_gff,
                            log_file, quiet)

    contig2ORFs = shared.import_ORFs(predicted_proteins_fasta, log_file, quiet)

    check.check_whether_ORFs_are_based_on_contigs(contig_names, contig2ORFs,
                                                  log_file, quiet)

    if 'run_diamond' in step_list:
        shared.run_diamond(path_to_diamond, diamond_database,
                           predicted_proteins_fasta, diamond_file, nproc,
                           sensitive, block_size, index_chunks, tmpdir, top,
                           log_file, quiet)

    (ORF2hits, all_hits) = shared.parse_diamond_file(diamond_file, one_minus_r,
                                                     log_file, quiet)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(fastaid2LCAtaxid_file,
                                                   all_hits, log_file, quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
        taxids_with_multiple_offspring_file, log_file, quiet)

    message = ('CAT is spinning! Files {0} and {1} are created.'
               ''.format(contig2classification_output_file,
                         ORF2LCA_output_file))
    shared.give_user_feedback(message, log_file, quiet)

    number_of_classified_contigs = 0

    with open(contig2classification_output_file,
              'w') as outf1, open(ORF2LCA_output_file, 'w') as outf2:
        outf1.write('# contig\tclassification\treason\tlineage\t'
                    'lineage scores\n')
        outf2.write('# ORF\tlineage\tbit-score\n')

        for contig in sorted(contig_names):
            if contig not in contig2ORFs:
                outf1.write('{0}\tunclassified\tno ORFs found\n'
                            ''.format(contig))

                continue

            LCAs_ORFs = []

            for ORF in contig2ORFs[contig]:
                if ORF not in ORF2hits:
                    outf2.write('{0}\tORF has no hit to database\n'
                                ''.format(ORF))

                    continue

                (taxid,
                 top_bitscore) = tax.find_LCA_for_ORF(ORF2hits[ORF],
                                                      fastaid2LCAtaxid,
                                                      taxid2parent)

                if taxid.startswith('no taxid found'):
                    # No lineage can be written; the message held in taxid
                    # is written in its place.
                    outf2.write('{0}\t{1}\t{2}\n'.format(
                        ORF, taxid, top_bitscore))
                else:
                    lineage = tax.find_lineage(taxid, taxid2parent)

                    if not no_stars:
                        lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)

                    outf2.write('{0}\t{1}\t{2}\n'
                                ''.format(ORF, ';'.join(lineage[::-1]),
                                          top_bitscore))

                # Every ORF with a hit is passed on, including those
                # without a taxid.
                LCAs_ORFs.append((taxid, top_bitscore))

            if not LCAs_ORFs:
                outf1.write('{0}\tunclassified\tno hits to database\n'
                            ''.format(contig))

                continue

            (lineages, lineages_scores,
             based_on_number_of_ORFs) = tax.find_weighted_LCA(
                 LCAs_ORFs, taxid2parent, f)

            # tax.find_weighted_LCA signals failure through sentinel strings
            # in its first return value.
            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tunclassified\t'
                            'hits not found in taxonomy files\n'
                            ''.format(contig))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write('{0}\tunclassified\t'
                            'no lineage reached minimum bit-score support\n'
                            ''.format(contig))

                continue

            # The contig has a valid classification.
            number_of_classified_contigs += 1

            for (i, lineage) in enumerate(lineages):
                if not no_stars:
                    lineage = tax.star_lineage(lineage,
                                               taxids_with_multiple_offspring)
                scores = [
                    '{0:.2f}'.format(score) for score in lineages_scores[i]
                ]

                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write('{0}\tclassified\t'
                                'based on {1}/{2} ORFs\t{3}\t{4}\n'
                                ''.format(contig, based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    outf1.write('{0}\tclassified ({1}/{2})\t'
                                'based on {3}/{4} ORFs\t{5}\t{6}\n'
                                ''.format(contig, i + 1, len(lineages),
                                          based_on_number_of_ORFs,
                                          len(contig2ORFs[contig]),
                                          ';'.join(lineage[::-1]),
                                          ';'.join(scores[::-1])))

    message = ('\n-----------------\n\n'
               '[{0}] CAT is done! {1}/{2} contigs classified.'
               ''.format(datetime.datetime.now(), number_of_classified_contigs,
                         len(contig_names)))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if f < 0.5:
        message = ('\nWARNING: since f is set to smaller than 0.5, one '
                   'contig may have multiple classifications.')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
コード例 #3
0
ファイル: prepare.py プロジェクト: samnooij/CAT
def run_existing(args, date):
    """Complete an existing CAT database by building only the missing parts.

    The taxonomy and database folders are inspected, a list of steps is built
    for every file that is not found, and that list is handed to prepare().
    Folders that do not exist yet are created.

    Args:
        args: Parsed command-line arguments, unpacked via
            check.convert_arguments().
        date: Value interpolated into the name of the log file and into the
            names of all files that still have to be created.

    Exits with status 1 (via sys.exit) when the DIAMOND check fails, when the
    folders contain an inconsistent set of files, or when the memory check
    fails while memory-intensive steps are scheduled.
    """
    step_list = []

    (database_folder,
     taxonomy_folder,
     path_to_diamond,
     quiet,
     no_log,
     nproc) = check.convert_arguments(args)

    if no_log:
        log_file = None
    else:
        log_file = '{0}.CAT_prepare.existing.log'.format(date)
        # Create the log file empty, truncating any existing one.
        with open(log_file, 'w'):
            pass

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    message = ('\n'
               'CAT prepare is running, constructing only parts of the '
               'database that are missing. Rawr!\n\n'
               'WARNING: CAT prepare at this point does not check whether the '
               'existing files are OK or corrupted, only if they are there.\n'
               'WARNING: note that the database and taxonomy files should be '
               'downloaded preferably at the same date.\n'
               'WARNING: preparing the database files may take a couple of '
               'hours.\n\n'
               'Supplied command: {0}\n\n'
               'Taxonomy folder: {1}/\n'
               'Database folder: {2}/\n'
               'Log file: {3}\n\n'
               '-----------------\n'.format(' '.join(sys.argv),
                                            taxonomy_folder,
                                            database_folder,
                                            log_file))
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    # Check DIAMOND path.
    error = check.check_diamond_binaries(path_to_diamond, log_file, quiet)
    if error:
        sys.exit(1)

    # Check taxonomy folder.
    taxonomy_folder_inspect = check.inspect_taxonomy_folder(taxonomy_folder)
    if taxonomy_folder_inspect == [None]:
        # [None] is treated as "folder does not exist".
        message = ('Taxonomy folder not found. Directory will be created '
                   'fresh and taxonomy files downloaded to it.')
        shared.give_user_feedback(message, log_file, quiet)

        nodes_dmp = None
        names_dmp = None
        prot_accession2taxid_file = None
    else:
        (nodes_dmp,
         names_dmp,
         prot_accession2taxid_file) = taxonomy_folder_inspect

        message = ('Taxonomy folder found.')
        shared.give_user_feedback(message, log_file, quiet)

    # Exactly one of nodes.dmp / names.dmp being present is an error.
    if (nodes_dmp is None) != (names_dmp is None):
        message = ('ERROR: CAT prepare did not find both nodes.dmp and '
                   'names.dmp in the taxonomy folder. They should be '
                   'downloaded together. Remove {0} and try again.'
                   ''.format([file_ for file_ in (nodes_dmp, names_dmp) if
                              file_ is not None][0]))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if nodes_dmp is None and names_dmp is None:
        message = ('Nodes.dmp and names.dmp will be downloaded to taxonomy '
                   'folder.')
        shared.give_user_feedback(message, log_file, quiet)

        step_list.append('download_taxonomy_files')
    else:
        message = 'Nodes.dmp found: {0}.'.format(nodes_dmp)
        shared.give_user_feedback(message, log_file, quiet)

        message = 'Names.dmp found: {0}.'.format(names_dmp)
        shared.give_user_feedback(message, log_file, quiet)

    if prot_accession2taxid_file is None:
        message = ('Prot.accession2taxid file will be downloaded to taxonomy '
                   'folder.')
        shared.give_user_feedback(message, log_file, quiet)

        prot_accession2taxid_file = ('{0}/{1}.prot.accession2taxid.gz'
                                     ''.format(taxonomy_folder, date))
        step_list.append('download_prot_accession2taxid_file')
    else:
        message = ('Prot.accession2taxid file found: {0}.'
                   ''.format(prot_accession2taxid_file))
        shared.give_user_feedback(message, log_file, quiet)

    # Check database folder.
    database_folder_inspect = check.inspect_database_folder(database_folder)
    if database_folder_inspect == [None]:
        message = ('Database folder not found. Directory will be created '
                   'fresh and necessary database files will be downloaded to '
                   'it / constructed in it.')
        shared.give_user_feedback(message, log_file, quiet)

        nr_file = None
        diamond_database = None
        fastaid2LCAtaxid_file = None
        taxids_with_multiple_offspring_file = None
    else:
        (nr_file,
         diamond_database,
         fastaid2LCAtaxid_file,
         taxids_with_multiple_offspring_file) = database_folder_inspect

        message = ('Database folder found.')
        shared.give_user_feedback(message, log_file, quiet)

    # Files that depend on the nr file.
    downstream_files = (diamond_database,
                        fastaid2LCAtaxid_file,
                        taxids_with_multiple_offspring_file)
    some_missing = None in downstream_files
    some_present = any(file_ is not None for file_ in downstream_files)
    if nr_file is None and some_missing and some_present:
        message = ('ERROR: Database folder does not contain an nr file, while '
                   'some but not all of the downstream files that depend on '
                   'it are present. In order to prevent strange bugs from '
                   'arising, please remove all files from the database folder '
                   'and try again.')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if (fastaid2LCAtaxid_file is None and
            taxids_with_multiple_offspring_file is not None):
        message = ('ERROR: file taxids_with_multiple_offspring exists but '
                   'fastaid2LCAtaxid is not found in the database folder '
                   'whilst taxids_with_multiple_offspring depends on it. In '
                   'order to prevent strange bugs from arising, please remove '
                   '{0} and try again.'
                   ''.format(taxids_with_multiple_offspring_file))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    # The nr file is not downloaded when it is missing but every file derived
    # from it is already there.
    whether_to_download_nr = True
    if nr_file is None and not some_missing:
        whether_to_download_nr = False

    if nr_file is None:
        if whether_to_download_nr:
            message = 'Nr file will be downloaded to database folder.'
            shared.give_user_feedback(message, log_file, quiet)

            nr_file = '{0}/{1}.nr.gz'.format(database_folder, date)
            step_list.append('download_nr')
    else:
        message = 'Nr file found: {0}.'.format(nr_file)
        shared.give_user_feedback(message, log_file, quiet)

    if diamond_database is None:
        message = 'DIAMOND database will be constructed from the nr file.'
        shared.give_user_feedback(message, log_file, quiet)

        diamond_database_prefix = '{0}/{1}.nr'.format(database_folder, date)
        step_list.append('make_diamond_database')
    else:
        message = 'DIAMOND database found: {0}.'.format(diamond_database)
        shared.give_user_feedback(message, log_file, quiet)

        # Strip the last '.dmnd' occurrence to obtain the prefix.
        diamond_database_prefix = diamond_database.rsplit('.dmnd', 1)[0]

    if fastaid2LCAtaxid_file is None:
        message = 'File fastaid2LCAtaxid will be created.'
        shared.give_user_feedback(message, log_file, quiet)

        fastaid2LCAtaxid_file = ('{0}/{1}.nr.fastaid2LCAtaxid'
                                 ''.format(database_folder, date))
        step_list.append('make_fastaid2LCAtaxid_file')
    else:
        message = ('Fastaid2LCAtaxid found: {0}.'
                   ''.format(fastaid2LCAtaxid_file))
        shared.give_user_feedback(message, log_file, quiet)

    if taxids_with_multiple_offspring_file is None:
        message = 'File taxids_with_multiple_offspring will be created.'
        shared.give_user_feedback(message, log_file, quiet)

        taxids_with_multiple_offspring_file = ('{0}/{1}.nr.taxids_with_'
                                               'multiple_offspring'
                                               ''.format(database_folder,
                                                         date))
        step_list.append('make_taxids_with_multiple_offspring_file')
    else:
        message = ('Taxids_with_multiple_offspring found: {0}.'
                   ''.format(taxids_with_multiple_offspring_file))
        shared.give_user_feedback(message, log_file, quiet)

    if nr_file is None and whether_to_download_nr is False:
        # This is pushed here just for the logic of the user.
        message = ('NOTE: Database folder contains all the necessary files '
                   'except for nr.gz. Since nr.gz is not used by CAT or BAT, '
                   'this is fine.')
        shared.give_user_feedback(message, log_file, quiet)

    if taxonomy_folder_inspect == [None] and database_folder_inspect == [None]:
        message = ('\n-----------------\n\n'
                   'WARNING: no taxonomy or database folder was found. CAT '
                   'prepare will create them fresh. Are you sure you are '
                   'linking to existing folders?')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if ('make_fastaid2LCAtaxid_file' in step_list or
            'make_taxids_with_multiple_offspring_file' in step_list):
        # Check memory.
        min_mem = 100
        (total_memory, error) = check.check_memory(min_mem)

        if error:
            message = ('ERROR: at least {0}GB of memory is needed for the '
                       'database construction. {1}GB is found on your system. '
                       'You can either try to find a machine with more '
                       'memory, or download preconstructed database files '
                       'from tbb.bio.uu.nl/bastiaan/CAT_prepare/.'
                       ''.format(min_mem, total_memory))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    if len(step_list) == 0:
        message = ('All necessary files are found. Existing database does not '
                   'need any more work...')
        shared.give_user_feedback(message, log_file, quiet, show_time=False)
    else:
        message = 'Ready to fly!\n\n-----------------\n'
        shared.give_user_feedback(message, log_file, quiet, show_time=False)

    if taxonomy_folder_inspect == [None]:
        os.mkdir(taxonomy_folder)
        message = 'Taxonomy folder {0} is created.'.format(taxonomy_folder)
        shared.give_user_feedback(message, log_file, quiet)

    if database_folder_inspect == [None]:
        os.mkdir(database_folder)
        message = 'Database folder {0} is created.'.format(database_folder)
        shared.give_user_feedback(message, log_file, quiet)

    prepare(step_list,
            taxonomy_folder,
            database_folder,
            date,
            prot_accession2taxid_file,
            nr_file,
            path_to_diamond,
            diamond_database_prefix,
            nproc,
            fastaid2LCAtaxid_file,
            taxids_with_multiple_offspring_file,
            log_file,
            quiet)
コード例 #4
0
ファイル: prepare.py プロジェクト: samnooij/CAT
def run_fresh(args, date):
    """Construct a complete CAT database and taxonomy from scratch.

    The run is refused (exit status 1) when the DIAMOND check fails, when
    either target folder already holds taxonomy or database files, or when
    the memory check fails. Otherwise any folder that does not exist yet is
    created and the full list of steps is handed to prepare().

    Args:
        args: Parsed command-line arguments, unpacked via
            check.convert_arguments().
        date: Value interpolated into the name of the log file and into the
            names of all files that are going to be created.
    """
    (database_folder, taxonomy_folder, path_to_diamond, quiet, no_log,
     nproc) = check.convert_arguments(args)

    log_file = None
    if not no_log:
        log_file = '{0}.CAT_prepare.fresh.log'.format(date)
        # Create the log file empty, truncating any existing one.
        open(log_file, 'w').close()

    version_line = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(version_line, log_file, quiet, show_time=False)

    banner_template = ('\n'
                       'CAT prepare is running, constructing a fresh '
                       'database.\n'
                       'Rawr!\n\n'
                       'WARNING: preparing the database files may take a '
                       'couple of hours.\n\n'
                       'Supplied command: {0}\n\n'
                       'Taxonomy folder: {1}/\n'
                       'Database folder: {2}/\n'
                       'Log file: {3}\n\n'
                       '-----------------\n')
    banner = banner_template.format(' '.join(sys.argv),
                                    taxonomy_folder,
                                    database_folder,
                                    log_file)
    shared.give_user_feedback(banner, log_file, quiet, show_time=False)

    # Check diamond path.
    if check.check_diamond_binaries(path_to_diamond, log_file, quiet):
        sys.exit(1)

    # Check taxonomy folder. [None] is treated as "folder does not exist".
    taxonomy_folder_inspect = check.inspect_taxonomy_folder(taxonomy_folder)
    taxonomy_folder_exists = taxonomy_folder_inspect != [None]
    if taxonomy_folder_exists:
        if any(found is not None for found in taxonomy_folder_inspect):
            message = ('ERROR: taxonomy folder {0} exists already and '
                       'contains taxonomy files. Please supply a novel or '
                       'empty folder if you want to start fresh, or run '
                       'CAT prepare --existing.'
                       ''.format(taxonomy_folder))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

        message = ('Taxonomy folder exists already. Taxonomy files will be '
                   'downloaded to it.')
        shared.give_user_feedback(message, log_file, quiet)

    # Check database folder.
    database_folder_inspect = check.inspect_database_folder(database_folder)
    database_folder_exists = database_folder_inspect != [None]
    if database_folder_exists:
        if any(found is not None for found in database_folder_inspect):
            message = ('ERROR: database folder {0} exists already and '
                       'contains database files. Please supply a novel or '
                       'empty folder if you want to start fresh.'
                       ''.format(database_folder))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

        message = ('Database folder exists already. Database file will be '
                   'downloaded to it / constructed in it.')
        shared.give_user_feedback(message, log_file, quiet)

    # Check memory.
    min_mem = 100
    (total_memory, error) = check.check_memory(min_mem)
    if error:
        message = ('ERROR: at least {0}GB of memory is needed for a fresh '
                   'database construction. {1}GB is found on your system. You '
                   'can either try to find a machine with more memory, or '
                   'download preconstructed database files from '
                   'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'
                   ''.format(min_mem, total_memory))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    # Create whichever folder is still missing, taxonomy folder first.
    if not taxonomy_folder_exists:
        os.mkdir(taxonomy_folder)
        shared.give_user_feedback('{0} is created.'.format(taxonomy_folder),
                                  log_file, quiet)
    if not database_folder_exists:
        os.mkdir(database_folder)
        shared.give_user_feedback('{0} is created.'.format(database_folder),
                                  log_file, quiet)

    # A fresh run carries out every step.
    step_list = [
        'download_taxonomy_files',
        'download_prot_accession2taxid_file',
        'download_nr',
        'make_diamond_database',
        'make_fastaid2LCAtaxid_file',
        'make_taxids_with_multiple_offspring_file',
    ]

    # Names of all files that are going to be created.
    prot_accession2taxid_file = '{0}/{1}.prot.accession2taxid.gz'.format(
        taxonomy_folder, date)
    nr_file = '{0}/{1}.nr.gz'.format(database_folder, date)
    diamond_database_prefix = '{0}/{1}.nr'.format(database_folder, date)
    fastaid2LCAtaxid_file = '{0}/{1}.nr.fastaid2LCAtaxid'.format(
        database_folder, date)
    taxids_with_multiple_offspring_file = (
        '{0}/{1}.nr.taxids_with_multiple_offspring'.format(database_folder,
                                                           date))

    prepare(step_list, taxonomy_folder, database_folder, date,
            prot_accession2taxid_file, nr_file, path_to_diamond,
            diamond_database_prefix, nproc, fastaid2LCAtaxid_file,
            taxids_with_multiple_offspring_file, log_file, quiet)
コード例 #5
0
ファイル: add_names.py プロジェクト: senaj/CAT
def add_names(args):
    """Append taxonomic names to the lineages of a classification file.

    The first line of the input file that starts with '#' is taken as the
    header and must contain a 'lineage' column; a 'lineage scores' column is
    optional. Every line is copied to the output file. Lines with a complete
    set of columns and a resolved taxid are extended with the names returned
    by tax.convert_to_official_names() or tax.convert_to_names().

    Args:
        args: Parsed command-line arguments, unpacked via
            check.convert_arguments().

    Exits with status 1 (via sys.exit) when the input/output file checks
    report an error or when the input file has no usable header.
    """
    (input_file,
     output_file,
     taxonomy_folder,
     only_official,
     force,
     quiet) = check.convert_arguments(args)

    # Currently add_names does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = []

    errors.append(check.check_input_file(input_file, log_file, quiet))

    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    (nodes_dmp,
     names_dmp,
     prot_accession2taxid_file) = check.inspect_taxonomy_folder(taxonomy_folder)

    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    taxid2name = tax.import_names(names_dmp, log_file, quiet)

    message = 'Appending names...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the header line and the columns of interest.
    with open(input_file, 'r') as f1:
        for line in f1:
            if line.startswith('#'):
                header = line.rstrip().split('\t')

                # list.index raises ValueError when the column is absent.
                try:
                    lineage_index = header.index('lineage')
                except ValueError:
                    message = ('ERROR: {0} is not a supported classification '
                               'file.'.format(input_file))
                    shared.give_user_feedback(message,
                                              log_file,
                                              quiet,
                                              error=True)

                    sys.exit(1)

                try:
                    scores_index = header.index('lineage scores')
                except ValueError:
                    scores_index = None

                full_length = len(header)

                break
        else:
            # No header line was found at all.
            message = ('ERROR: {0} is not a supported classification file.'
                       ''.format(input_file))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    # Second pass: copy every line, appending names where possible.
    with open(input_file, 'r') as f1, open(output_file, 'w') as outf1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                if only_official:
                    outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t'
                                'family\tgenus\tspecies\n'.format(line))
                else:
                    outf1.write('{0}\tfull lineage names\n'.format(line))

                continue

            line = line.split('\t')

            if len(line) != full_length:
                # Entry does not have a full annotation.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            if (line[1].startswith('no taxid found') or
                    line[2].startswith('no taxid found')):
                # ORF has database hits but the accession number is not found
                # in the taxonomy files.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            lineage = line[lineage_index].split(';')

            # Compare against None explicitly so that a column index of 0
            # is not mistaken for "no scores column".
            if scores_index is not None:
                scores = line[scores_index].split(';')
            else:
                scores = None

            if only_official:
                names = tax.convert_to_official_names(lineage,
                                                      taxid2rank,
                                                      taxid2name,
                                                      scores)
            else:
                names = tax.convert_to_names(lineage,
                                             taxid2rank,
                                             taxid2name,
                                             scores)

            outf1.write('{0}\t{1}\n'.format('\t'.join(line), '\t'.join(names)))

    message = 'Names written to {0}!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)