Exemple #1
0
    parser.add_argument('-y', action="store_true", dest='allyes',
                        help='assume "yes" to all questions')

    parser.add_argument('-f', action="store_true", dest='force',
                        help='forces download even if the files exist')

    parser.add_argument('-s', action="store_true", dest='simulate',
                        help='simulate and print commands. Nothing is downloaded')


    args = parser.parse_args()
    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')

    if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')):
        if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
            print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green')
            download_groups()
        else:
            print 'Skipping'
    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    # if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')):
    #     print colorify('Downloading "og2level.tsv.gz" at %s' %get_data_path(), 'green')
    #     download_og2level()

    # if 'all' in args.dbs:
    #     args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify(
                'Downloading "eggnog.db" at %s...' % get_data_path(), 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        if not args.quiet:
            print colorify(
                'Skipping eggnog.db database (already present). Use -f to force download',
                'lblue')

    # if args.force or not pexists(pjoin(get_data_path(), 'OG_fasta')):
    #     if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
Exemple #3
0
            args.taxids != "") and (args.taxa is not None and args.taxa != ""):
        print(colorify(f'Use either --taxids or --taxa, not both', 'red'))
        sys.exit(1)

    ##

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    data_path = get_data_path()

    # http://eggnog5.embl.de/download/eggnog_5.0/e5.proteomes.faa
    if not pexists(get_eggnog_proteins_file()):
        if args.allyes or ask(
                f"Download eggnog5 proteins to {data_path}? ~9GB (It is required to create new databases)"
        ) == 'y':
            print(
                colorify(
                    f'Downloading eggnog5 proteins file to {data_path}...',
                    'green'))
            download_proteins(data_path)
        else:
            print(
                colorify(
                    f'eggnog5 proteins file was not found. Use --data_dir to specify another data path, or allow the download',
                    'red'))
            sys.exit(1)
    else:
    parser.add_argument('-y', action="store_true", dest='allyes',
                        help='assume "yes" to all questions')

    parser.add_argument('-f', action="store_true", dest='force',
                        help='forces download even if the files exist')

    parser.add_argument('-s', action="store_true", dest='simulate',
                        help='simulate and print commands. Nothing is downloaded')


    args = parser.parse_args()
    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')

    if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')):
        if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
            print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green')
            download_groups()
        else:
            print 'Skipping'
                        help='Directory to use for DATA_PATH.')

    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    data_path = get_data_path()

    ##
    # Annotation DB
    
    if args.force or not pexists(get_eggnogdb_file()):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print(colorify(f'Downloading "eggnog.db" at {data_path}...', 'green'))
            download_annotations(data_path)
        else:
            print('Skipping')
    else:
        if not args.quiet:
            print(colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue'))

    ##
    # NCBI taxa
    
    if args.force or not pexists(get_ncbitaxadb_file()):
        if args.allyes or ask("Download taxa database?") == 'y':
            print(colorify(f'Downloading "eggnog.taxa.db" at {data_path}...', 'green'))
def parse_args(parser):
    """Parse the command line and validate / normalise the resulting options.

    Besides calling ``parser.parse_args()``, this applies the data path
    (``EGGNOG_DATA_DIR`` first, then ``args.data_dir`` so the command line
    wins), handles the informational ``version`` and ``list_taxa`` switches,
    checks the option combinations allowed for each search mode, and derives
    a number of attributes that are stored back on ``args``.

    Args:
        parser: the argument parser; it must provide ``parse_args()`` and
            ``error()`` (presumably an ``argparse.ArgumentParser`` -- confirm
            against the caller).

    Returns:
        The parsed namespace, extended with ``call_info``, the per-searcher
        thresholds (``dmnd_evalue``, ``mmseqs_evalue``, ``hmm_evalue``,
        ``dmnd_score``, ``mmseqs_score``, ``hmm_score``), ``qcov``,
        ``go_excluded`` and ``cpus_per_worker``. ``tax_scope_mode`` and
        ``tax_scope_id`` are only set when annotation or ortholog reporting
        is requested.

    Raises:
        EmapperException: if a required DIAMOND, MMseqs2 or annotation
            database file does not exist.
        ValueError: for an unknown ``go_evidence``, ``pfam_transfer`` or
            ``pfam_realign`` value.
        SystemExit: after printing the version or the taxa list.
    """
    args = parser.parse_args()

    # Environment first, command line second: --data_dir overrides the variable.
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        # Fall back to the short version string if the full report fails.
        try:
            version = get_full_version_info()
        except Exception:
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        from eggnogmapper.vars import LEVEL_DEPTH, LEVEL_DICT, LEVEL_NAMES, LEVEL_PARENTS
        print("tax_name\ttax_id\tdepth\tparents\tparents_names")
        for tax_name, tax_id in LEVEL_DICT.items():
            depth = LEVEL_DEPTH.get(tax_id, "-")
            parents = LEVEL_PARENTS.get(tax_id, "-")
            parents_names = [LEVEL_NAMES.get(x, "-") for x in parents]
            print(f"{tax_name}\t{tax_id}\t{depth}\t{','.join(parents)}\t{','.join(parents_names)}")
        sys.exit(0)

    # A cpu value of 0 means "use every available core".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # translate
    if args.itype in (ITYPE_GENOME, ITYPE_META, ITYPE_PROTS) and args.translate == True:
        parser.error('"--translate" only can be used with "--itype CDS"')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # The default -m is diamond, but -m no_search is assumed when
            # --annotate_hits_table has been provided and -i has not.
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("Diamond jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("MMseqs2 jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        # CDS input is always translated in HMMER mode.
        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal string "none" is normalised to a real None.
        if args.clean_overlaps == "none":
            args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error('--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic values are copied to every searcher.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # Every target must be an attribute of args; the previous code assigned
    # the HMMER score to a stray local named "args_hmm_score".
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_mode, args.tax_scope_id = __parse_tax_scope(args.tax_scope)
        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_transfer not in (PFAM_TRANSFER_BEST_OG, PFAM_TRANSFER_NARROWEST_OG, PFAM_TRANSFER_SEED_ORTHOLOG):
        raise ValueError(f'Invalid --pfam_transfer option {args.pfam_transfer}')

    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign in (PFAM_REALIGN_REALIGN, PFAM_REALIGN_DENOVO):
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # Distribute the cpus evenly among all workers of all servers.
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Guard the modulo below against ZeroDivisionError.
        parser.error(f"Total workers (num_workers * num_servers = {total_workers}) must be at least 1.")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = int(args.cpu / total_workers)

    return args
    parser.add_argument('-q',
                        action="store_true",
                        dest='quiet',
                        help='quiet_mode')

    parser.add_argument("--data_dir",
                        metavar='',
                        type=existing_dir,
                        help='Directory to use for DATA_PATH.')

    args = parser.parse_args()

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')):
        print colorify('Downloading "og2level.tsv.gz" at %s' % get_data_path(),
                       'green')
        download_og2level()

    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify(
                'Downloading "eggnog.db" at %s...' % get_data_path(), 'green')
            download_annotations()
        else:
            print 'Skipping'
Exemple #8
0
def parse_args(parser):
    """Parse the command line and validate / normalise the resulting options.

    Besides calling ``parser.parse_args()``, this applies the data path
    (``EGGNOG_DATA_DIR`` first, then ``args.data_dir`` so the command line
    wins), handles the informational ``version`` and ``list_taxa`` switches,
    sets the multiprocessing start method, checks the option combinations
    allowed for each search mode, and derives a number of attributes that
    are stored back on ``args``.

    Args:
        parser: the argument parser; it must provide ``parse_args()`` and
            ``error()`` (presumably an ``argparse.ArgumentParser`` -- confirm
            against the caller).

    Returns:
        The parsed namespace, extended with ``call_info``, the per-searcher
        thresholds (``dmnd_evalue``, ``mmseqs_evalue``, ``hmm_evalue``,
        ``dmnd_score``, ``mmseqs_score``, ``hmm_score``), ``qcov``,
        ``go_excluded`` and ``cpus_per_worker``. ``tax_scope_ids`` is only
        set when annotation or ortholog reporting is requested.

    Raises:
        EmapperException: if a required DIAMOND, MMseqs2 or annotation
            database file does not exist.
        ValueError: for an unknown ``go_evidence`` or ``pfam_realign`` value.
        SystemExit: after printing the version or the taxa list.
    """
    args = parser.parse_args()

    # Environment first, command line second: --data_dir overrides the variable.
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        # Fall back to the short version string if the full report fails.
        try:
            version = get_full_version_info()
        except Exception:
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        print_taxa()
        sys.exit(0)

    # A cpu value of 0 means "use every available core".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()
    multiprocessing.set_start_method(args.mp_start_method)

    if args.resume == True and args.override == True:
        parser.error('Only one of --resume or --override is allowed.')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # The default -m is diamond, but -m no_search is assumed when
            # --annotate_hits_table has been provided and -i has not.
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        # CDS input is always translated in HMMER mode.
        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal string "none" is normalised to a real None.
        if args.clean_overlaps == "none":
            args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error('--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic values are copied to every searcher.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # Every target must be an attribute of args; the previous code assigned
    # the HMMER score to a stray local named "args_hmm_score".
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_ids = parse_tax_scope(args.tax_scope)

        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = {"EXP", "IDA", "IPI", "IMP", "IGI", "IEP"}
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = {"ND", "IEA"}

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign in (PFAM_REALIGN_REALIGN, PFAM_REALIGN_DENOVO):
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # Distribute the cpus evenly among all workers of all servers.
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Guard the modulo below against ZeroDivisionError.
        parser.error(f"Total workers (num_workers * num_servers = {total_workers}) must be at least 1.")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = int(args.cpu / total_workers)

    return args