Ejemplo n.º 1
0
def train_model(working_dir, viral_ftrfile, nonviral_ftrfile, balanced, jobs,
                use_conda_off, snakemake_args):
    '''Training customized classifier model.
    '''
    # NOTE(review): the docstring looks like user-facing CLI help text
    #   (presumably rendered by the command framework -- confirm), so
    #   developer notes live in comments.
    #
    # Builds one snakemake command line for "rules/train-model.smk" and
    #   runs it through the shell:
    #   - working_dir is passed as --directory
    #   - viral_ftrfile / nonviral_ftrfile / balanced / jobs are passed as
    #     --config values (jobs is also used for --jobs)
    #   - use_conda_off truthy => neither --use-conda nor --conda-prefix
    #   - snakemake_args is a sequence of extra arguments appended verbatim
    # Exits with status 1 when the snakemake command fails.
    # NOTE: values are interpolated unquoted, so paths containing spaces
    #   or shell metacharacters are not supported.

    DEFAULT_CONFIG = get_default_config()

    if balanced is None:
        balanced = False

    if use_conda_off:
        conda_flag = ''
        conda_prefix = ''
    else:
        conda_flag = '--use-conda'
        conda_prefix = '--conda-prefix {}'.format(
            os.path.join(DEFAULT_CONFIG['DBDIR'], 'conda_envs'))

    # a literal "--" is inserted unless the first extra argument is
    #   itself an option (starts with "-")
    if snakemake_args and snakemake_args[0].startswith('-'):
        add_args = ''
    else:
        add_args = '--'

    cmd = ('snakemake --snakefile {snakefile} '
           '--directory {working_dir} '
           '--config '
           'Viral_ftrfile={viral_ftrfile} '
           'Nonviral_ftrfile={nonviral_ftrfile} '
           'Balanced={balanced} '
           'Jobs={jobs} '
           '--jobs {jobs} --rerun-incomplete --latency-wait 600 '
           '--nolock --quiet {use_conda_off} {conda_prefix} '
           '{add_args} {args}').format(
               snakefile=get_snakefile('rules/train-model.smk'),
               working_dir=working_dir,
               viral_ftrfile=viral_ftrfile,
               nonviral_ftrfile=nonviral_ftrfile,
               balanced=balanced,
               jobs=jobs,
               use_conda_off=conda_flag,
               conda_prefix=conda_prefix,
               add_args=add_args,
               args=' '.join(snakemake_args),
           )
    logging.info('Executing: %s', cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError:
        # exit quietly (no traceback); snakemake already reported the
        #   failure. SystemExit is raised directly because the bare
        #   exit() helper is injected by the site module and is not
        #   guaranteed to exist (e.g. python -S, frozen apps).
        raise SystemExit(1)
Ejemplo n.º 2
0
import os
import screed
import numpy as np
import pandas as pd
import click

from ruamel.yaml import YAML


# Walk up from this file: its directory, that directory's parent, and the
# parent above that. The top-most one is appended to sys.path so that the
# `virsorter` package import below can be resolved when this file is run
# as a stand-alone script.
# NOTE(review): `sys` is not imported in the visible import lines above --
#   confirm `import sys` exists earlier in the file.
script_dir = os.path.dirname(os.path.abspath(__file__))
snakefile_dir = os.path.dirname(script_dir)
pkg_dir = os.path.dirname(snakefile_dir)
sys.path.append(pkg_dir)
from virsorter.config import get_default_config, set_logger

# Configuration is loaded once at import time; the values below are read
# from it by key.
DEFAULT_CONFIG = get_default_config()
# Short alias for the 'GROUP_INFO' entry of the configuration.
D = DEFAULT_CONFIG['GROUP_INFO']
DEFAULT_MIN_SIZE_ALLOWED_WO_HALLMARK_GENE = \
        DEFAULT_CONFIG['DEFAULT_MIN_SIZE_ALLOWED_WO_HALLMARK_GENE']

# Passed to click.command() below so that both -h and --help show help.
CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help']}
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--hallmark-required', is_flag=True, default=False,
        help='require hallmark gene')
@click.option('--hallmark-required-on-short', is_flag=True, default=False,
        help='require hallmark gene on short seqs')
@click.option('--viral-gene-required', is_flag=True, default=False,
        help='require viral gene')
@click.argument('config', type=click.Path())
@click.argument('intable', type=click.Path())
@click.argument('inseqfile', type=click.Path())
Ejemplo n.º 3
0
def config(show, show_source, init_source, db_dir, set, get):
    '''CLI for managing configurations.

    There are many configurations kept in "template-config.yaml" in source
    code directory or "~/.virsorter" (when source code directory is not
    writable for user). This file can be located with
    `virsorter config --show-source`. You can set the configurations with
    `virsorter config --set KEY=VAL`. Alternatively, you can edit the
    configuration file ("template-config.yaml") directly.
    '''
    # NOTE(review): the docstring reads like user-facing CLI help text
    #   (presumably rendered by the command framework -- confirm), so
    #   developer notes live in comments.
    #
    # At most one action runs per call, checked in this order:
    #   init_source, show, show_source, get, set. Every action ends with
    #   sys.exit(); the function returns normally only when no action was
    #   requested.
    #   - get: comma separated list of dotted keys, e.g. "A,B.C"
    #   - set: comma separated list of KEY=VAL items with dotted keys
    # NOTE: the `set` parameter shadows the builtin inside this function;
    #   the name is kept because it is part of the caller interface.

    from virsorter.config import (TEMPLATE, SRC_CONFIG_DIR, USER_CONFIG_DIR,
                                  init_config_template)

    if init_source:
        if db_dir is None:
            mes = '--db-dir is required for --init-source'
            logging.critical(mes)
            sys.exit(1)

        if not os.path.isdir(db_dir):
            mes = (f'--db-dir {db_dir} does NOT exist yet; Make sure it '
                   'is created later\n')
            logging.warning(mes)

        db_dir = os.path.abspath(db_dir)
        init_config_template(SRC_CONFIG_DIR, USER_CONFIG_DIR, db_dir)
        sys.exit(0)

    if not os.path.isfile(TEMPLATE):
        mes = ('config file "template-config.yaml" has not been '
               'initialized yet; Please use '
               '`virsorter config --init-source --db-dir PATH` to initialize')
        logging.critical(mes)
        sys.exit(1)

    cfg = get_default_config()

    if show:
        YAML().dump(cfg, sys.stdout)
        sys.exit(0)

    if show_source:
        mes = f'config file path: {TEMPLATE}\n'
        sys.stdout.write(mes)
        sys.exit(0)

    if get is not None:
        for var in (v.strip() for v in get.split(',')):
            node = cfg
            for key in var.split('.'):
                key = key.strip()
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    # TypeError: an intermediate value is not a mapping
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            mes = f'{var}: {node}\n'
            sys.stdout.write(mes)

        sys.exit(0)

    if set is not None:
        for item in (it.strip() for it in set.split(',')):
            # split on the first "=" only, so values may contain "="
            var, sep, val = item.partition('=')
            if not sep:
                mes = f'{item} is not in KEY=VAL format'
                logging.critical(mes)
                sys.exit(1)
            var = var.strip()
            val = val.strip()
            keys = [key.strip() for key in var.split('.')]

            # descend to the mapping that holds the last key
            node = cfg
            for key in keys[:-1]:
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            last_key = keys[-1]
            try:
                old_val = node[last_key]
            except (KeyError, TypeError):
                mes = f'{last_key} is not a key in config file ({TEMPLATE})'
                logging.critical(mes)
                sys.exit(1)

            # keep the type of the value that is being replaced
            if isinstance(old_val, int):
                try:
                    val = int(val)
                except ValueError:
                    mes = f'{var} is supposed to be an integer'
                    logging.critical(mes)
                    sys.exit(1)
            elif isinstance(old_val, float):
                try:
                    val = float(val)
                except ValueError:
                    mes = f'{var} is supposed to be a float'
                    logging.critical(mes)
                    sys.exit(1)
            # only convert to abspath when the old one exists
            #   since sometimes just want to set relative path;
            #   the str check keeps non-path values (None, mappings,
            #   lists) from crashing os.path.exists()
            elif isinstance(old_val, str) and os.path.exists(old_val):
                val = os.path.abspath(val)
            node[last_key] = val

            mes = f'{var}: {old_val} ==> {val}\n'
            sys.stdout.write(mes)
            # written after every item, so items processed before a
            #   failing one stay saved
            with open(TEMPLATE, 'w') as fw:
                YAML().dump(cfg, fw)

        sys.exit(0)
Ejemplo n.º 4
0
def train_feature(working_dir, seqfile, hmm, hallmark, prodigal_train,
                  frags_per_genome, min_length, max_orf_per_seq, genome_as_bin,
                  jobs, use_conda_off, snakemake_args):
    '''Training features for customized classifier.

    Executes a snakemake workflow to do the following:
    1) prepare random DNA fragments from viral and nonviral genome data
    2) extract feature from random DNA fragments to make ftrfile
    '''
    # NOTE(review): the docstring looks like user-facing CLI help text
    #   (presumably rendered by the command framework -- confirm), so
    #   developer notes live in comments.
    #
    # seqfile is a sequence of glob patterns. Each pattern is expanded
    #   only to check that at least one file matches; the patterns
    #   themselves (made absolute) are what is handed to snakemake.
    # hmm / hallmark / prodigal_train: None is passed on as 'NA'.
    # Exits with status 1 when no file matches or when snakemake fails.
    # NOTE: values are interpolated unquoted, so paths containing spaces
    #   or shell metacharacters are not supported.

    cwd = os.getcwd()
    matched = []
    abs_patterns = []
    for pat in seqfile:
        matched.extend(glob.glob(pat))
        # snakemake runs in --directory, so relative patterns are
        #   anchored to the current directory first
        if os.path.isabs(pat):
            abs_patterns.append(pat)
        else:
            abs_patterns.append(os.path.join(cwd, pat))

    if not matched:
        mes = 'No files match {}'.format(' '.join(seqfile))
        logging.critical(mes)
        sys.exit(1)

    mes = '{} seqfiles are used for training features'.format(len(matched))
    logging.info(mes)

    if hmm is None:
        hmm = 'NA'
    if hallmark is None:
        hallmark = 'NA'
    if prodigal_train is None:
        prodigal_train = 'NA'

    # loaded after the input check so bad input is reported first
    DEFAULT_CONFIG = get_default_config()

    if use_conda_off:
        conda_flag = ''
        conda_prefix = ''
    else:
        conda_flag = '--use-conda'
        conda_prefix = '--conda-prefix {}'.format(
            os.path.join(DEFAULT_CONFIG['DBDIR'], 'conda_envs'))

    # a literal "--" is inserted unless the first extra argument is
    #   itself an option (starts with "-")
    if snakemake_args and snakemake_args[0].startswith('-'):
        add_args = ''
    else:
        add_args = '--'

    cmd = ('snakemake --snakefile {snakefile} '
           '--directory {working_dir} '
           '--config Viral_seqfile="{seqfile}" '
           'Hmm={hmm} '
           'Hallmark={hallmark} '
           'Rbs={prodigal_train} '
           'Min_length={min_length} '
           'Max_orf_per_seq={max_orf_per_seq} '
           'Viral_genome_as_bin={genome_as_bin} '
           'Fragments_per_genome={frags_per_genome} '
           '--jobs {jobs} --rerun-incomplete --latency-wait 600 '
           '--nolock --quiet {use_conda_off} {conda_prefix} '
           '{add_args} {args}').format(
               snakefile=get_snakefile('rules/train-feature.smk'),
               working_dir=working_dir,
               seqfile=' '.join(abs_patterns),
               hmm=hmm,
               hallmark=hallmark,
               prodigal_train=prodigal_train,
               min_length=min_length,
               max_orf_per_seq=max_orf_per_seq,
               genome_as_bin=genome_as_bin,
               frags_per_genome=frags_per_genome,
               jobs=jobs,
               use_conda_off=conda_flag,
               conda_prefix=conda_prefix,
               add_args=add_args,
               args=' '.join(snakemake_args),
           )
    logging.info('Executing: %s', cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError:
        # exit quietly (no traceback); snakemake already reported the
        #   failure
        sys.exit(1)
Ejemplo n.º 5
0
def config(show, show_source, init_source, db_dir, set, get):
    '''CLI for managing configurations.

    There are many configurations kept in "template-config.yaml" in source
    code directory or "~/.virsorter" (when source code directory is not
    writable for user). This file can be located with
    `virsorter config --show-source`. You can set the configurations with
    `virsorter config --set KEY=VAL`. Alternatively, you can edit the
    configuration file ("template-config.yaml") directly.
    '''
    # NOTE(review): the docstring reads like user-facing CLI help text
    #   (presumably rendered by the command framework -- confirm), so
    #   developer notes live in comments.
    #
    # At most one action runs per call, checked in this order:
    #   init_source, show, show_source, get, set. Every action ends with
    #   sys.exit(); the function returns normally only when no action was
    #   requested.
    #   - get: comma separated list of dotted keys, e.g. "A,B.C"
    #   - set: comma separated list of KEY=VAL items with dotted keys;
    #     values are stored as the given strings
    # NOTE: the `set` parameter shadows the builtin inside this function;
    #   the name is kept because it is part of the caller interface.

    from virsorter.config import (TEMPLATE, SRC_CONFIG_DIR, USER_CONFIG_DIR,
                                  init_config_template)

    if init_source:
        if db_dir is None:
            mes = '--db-dir is required for --init-source'
            logging.critical(mes)
            sys.exit(1)

        init_config_template(SRC_CONFIG_DIR, USER_CONFIG_DIR, db_dir)
        sys.exit(0)

    if not os.path.isfile(TEMPLATE):
        mes = ('config file "template-config.yaml" has not been '
               'initialized yet; Please use '
               '`virsorter config --init-source --db-dir PATH` to initialize')
        logging.critical(mes)
        sys.exit(1)

    cfg = get_default_config()

    if show:
        YAML().dump(cfg, sys.stdout)
        sys.exit(0)

    if show_source:
        mes = f'config file path: {TEMPLATE}\n'
        sys.stdout.write(mes)
        sys.exit(0)

    if get is not None:
        for var in (v.strip() for v in get.split(',')):
            node = cfg
            for key in var.split('.'):
                key = key.strip()
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    # TypeError: an intermediate value is not a mapping
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            mes = f'{var}: {node}\n'
            sys.stdout.write(mes)

        sys.exit(0)

    if set is not None:
        for item in (it.strip() for it in set.split(',')):
            # split on the first "=" only, so values may contain "="
            var, sep, val = item.partition('=')
            if not sep:
                mes = f'{item} is not in KEY=VAL format'
                logging.critical(mes)
                sys.exit(1)
            var = var.strip()
            val = val.strip()
            keys = [key.strip() for key in var.split('.')]

            # descend to the mapping that holds the last key
            node = cfg
            for key in keys[:-1]:
                try:
                    node = node[key]
                except (KeyError, TypeError):
                    mes = f'{key} is not a key in config file ({TEMPLATE})'
                    logging.critical(mes)
                    sys.exit(1)

            last_key = keys[-1]
            try:
                # only existing keys may be overwritten
                old_val = node[last_key]
            except (KeyError, TypeError):
                mes = f'{last_key} is not a key in config file ({TEMPLATE})'
                logging.critical(mes)
                sys.exit(1)
            node[last_key] = val

            mes = f'{var}: {old_val} ==> {val}\n'
            sys.stdout.write(mes)
            # written after every item, so items processed before a
            #   failing one stay saved
            with open(TEMPLATE, 'w') as fw:
                YAML().dump(cfg, fw)

        sys.exit(0)