def run_makedb(fasta, db=None, **kwargs):
    '''Build a formatted diamond database from a fasta file.

    This is similar to running ``diamond makedb --in db.faa --db db``.

    Parameters
    ----------
    fasta : str
        Input path for the fasta file.
    db : str or None (default)
        Output path for the formatted database file. When None, the
        input path with its extension stripped is used, so the database
        is named after the input file in the same directory.
    kwargs : dict
        keyword arguments. Other command line parameters for
        diamond makedb.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    # Fall back to the fasta path minus its extension when no db is given.
    target = splitext(fasta)[0] if db is None else db
    app = Dumpling(
        ['diamond', 'makedb'],
        params=Parameters(*makedb_params),
        version='0.7.12',
        url='https://github.com/bbuchfink/diamond')
    app.update(fasta=fasta, db=target, **kwargs)
    log.info('Running {}'.format(app.command))
    app()
    return app
def run(db, query, out_dir, **kwargs):
    '''Scan a fasta file against an HMM database with hmmscan.

    The target hits table is written to ``<out_dir>/<prefix>.tblout``,
    where ``<prefix>`` is the base name of `query` without its extension.

    Parameters
    ----------
    db : str
        The file path to HMM database.
    query : str
        Input fasta file.
    out_dir : str
        dir to store output file path of target hits table.
    kwargs : dict
        Other command line parameters for hmmscan. key is the option
        (e.g. "-T") and value is the value for the option (e.g. "50").
        If the option is a flag, set the value to None.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    stem, _ = splitext(basename(query))
    tblout = join(out_dir, '{}.tblout'.format(stem))
    app = Dumpling('hmmscan', params=Parameters(*_scan_params))
    app.update(query=query, db=db, out=tblout, **kwargs)
    log.info('Running {}'.format(app.command))
    app()
    return app
def run(db, query, out_dir, **kwargs):
    '''Scan a fasta file against a covariance model database with cmscan.

    The target hits table is written to ``<out_dir>/<prefix>.tblout``,
    where ``<prefix>`` is the base name of `query` without its extension.

    Parameters
    ----------
    db : str
        The file path to CM database.
    query : str
        Input fasta file.
    out_dir : str
        dir to store output file path of target hits table.
    kwargs : dict
        keyword arguments. command line parameters for cmscan.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    stem, _ = splitext(basename(query))
    tblout = join(out_dir, '{}.tblout'.format(stem))
    app = Dumpling('cmscan', params=Parameters(*cmscan_params))
    app.update(query=query, db=db, out=tblout, **kwargs)
    log.info('Running {}'.format(app.command))
    app()
    return app
def run_blast(query, daa, aligner='blastp', **kwargs):
    '''Search query sequences against the database with diamond.

    Parameters
    ----------
    query : str
        The file path of the query seq
    daa : str
        The file path of the output daa file
    aligner : str
        The aligner. blastp or blastx. It is used as the diamond
        subcommand.
    kwargs : dict
        keyword arguments. Command line parameters for diamond blastp
        or blastx.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    command = ['diamond', aligner]
    app = Dumpling(command, params=Parameters(*blast_params))
    app.update(query=query, daa=daa, **kwargs)
    log.info('Running {}'.format(app.command))
    app()
    return app
def run(query, out_dir, cpus=1, **kwargs):
    """Run prodigal for gene prediction.

    Notes
    -----
    It will create 3 output files with prefix of "prodigal" in "out_dir"
    folder:

    1. the annotation file (GFF3 file by default)
    2. the nucleotide sequences for each predicted gene with file
       suffix of .fna.
    3. the protein sequence translated from each gene with file suffix
       of .faa.

    An output option that the caller already switched on through
    `kwargs` is left untouched; only options that are still off get the
    default path inside `out_dir`. The standard output of the command is
    directed to ``<out_dir>/prodigal.log``.

    Parameters
    ----------
    query : str
        input file path of sequence
    out_dir : str
        output dir. It is created if it does not exist.
    cpus : int
        Prodigal does not have a param to set up how many CPU cores.
        This is a fake parameter to make it conform to the same API
        with other apps for the sake of convenience.
    kwargs : dict
        keyword arguments for Prodigal.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    """
    log = getLogger(__name__)
    makedirs(out_dir, exist_ok=True)
    app = Dumpling(
        "prodigal",
        params=Parameters(*params),
        version="v2.6.3",
        url="https://github.com/hyattpd/Prodigal",
    )
    # Defaults first (gff output, draft genome mode), then let the
    # caller's kwargs override them.
    app.update(query=query, fmt="gff", mode="single")
    app.update(**kwargs)
    # Output flag -> file suffix; the annotation suffix follows whatever
    # output format is in effect after the kwargs were applied.
    default_suffixes = (
        ("-a", "faa"),
        # output file of nucleotide sequences of genes
        ("-d", "fna"),
        ("-o", app.params["fmt"].value),
    )
    for flag, suffix in default_suffixes:
        option = app.params[flag]
        if not option.is_off():
            continue
        option.on(join(out_dir, "prodigal.{}".format(suffix)))
    log.info("Running CDS prediction {}".format(app.command))
    app(stdout=join(out_dir, "prodigal.log"))
    return app
def run(query, out_dir, cpus=1, gff=True, **kwargs):
    '''Predict CRISPRs for the input file.

    Notes
    -----
    It will create 1 or 2 output files, depending on the parameters:

    1. file containing CRISPR information, including locations of
       CRISPRs and their sequence composition OR GFF file with short
       information on CRISPR locations OR GFFFull file with detailed
       information on CRISPR locations
    2. (OPTIONAL; -spacers flag) Fasta file of predicted CRISPR spacers

    The main output path is ``<out_dir>/<prefix>.gff``, where
    ``<prefix>`` is the base name of `query` without its extension.

    Parameters
    ----------
    query : str
        input file path of nucleotide sequence
    out_dir : str
        output dir
    cpus : int
        Minced does not have a param to set up how many CPU cores.
        This is a fake parameter to make it conform to the same API
        with other apps for the sake of convenience.
    gff : bool
        output in gff3 format
    kwargs : dict
        keyword arguments. Other command line parameters for MinCED.
        key is the option (e.g. "-searchWL") and value is the value for
        the option (e.g. "6"). If the option is a flag, set the value
        to None.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.

    Raises
    ------
    CalledProcessError
        If the expected output file does not exist after the run.
    '''
    log = getLogger(__name__)
    app = Dumpling(
        'minced',
        params=Parameters(*params),
        version='0.2.0',
        url='https://github.com/ctSkennerton/minced')
    stem, _ = splitext(basename(query))
    gff_path = join(out_dir, '{}.gff'.format(stem))
    app.update(query=query, out=gff_path, gff=gff, **kwargs)
    log.info('Running {}'.format(app.command))
    proc = app()
    if exists(gff_path):
        return app
    # minced raises Java error but return 0 exit code if the input file
    # is invalid, so raise the exception for minced manually if no
    # output is created.
    proc.returncode = 1
    raise CalledProcessError(proc.returncode, cmd=repr(proc.args))
def run(query, out_dir, cpus=1, gff=True, **kwargs):
    """Predict CRISPRs for the input file.

    Notes
    -----
    It will create 1 or 2 output files, depending on the parameters:

    1. file containing CRISPR information, including locations of
       CRISPRs and their sequence composition OR GFF file with short
       information on CRISPR locations OR GFFFull file with detailed
       information on CRISPR locations
    2. (OPTIONAL; -spacers flag) Fasta file of predicted CRISPR spacers

    The main output path is ``<out_dir>/<prefix>.gff``, where
    ``<prefix>`` is the base name of `query` without its extension.

    Parameters
    ----------
    query : str
        input file path of nucleotide sequence
    out_dir : str
        output dir
    cpus : int
        Minced does not have a param to set up how many CPU cores.
        This is a fake parameter to make it conform to the same API
        with other apps for the sake of convenience.
    gff : bool
        output in gff3 format
    kwargs : dict
        keyword arguments. Other command line parameters for MinCED.
        key is the option (e.g. "-searchWL") and value is the value for
        the option (e.g. "6"). If the option is a flag, set the value
        to None.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.

    Raises
    ------
    CalledProcessError
        If the expected output file does not exist after the run.
    """
    log = getLogger(__name__)
    app = Dumpling(
        "minced",
        params=Parameters(*params),
        version="0.2.0",
        url="https://github.com/ctSkennerton/minced",
    )
    stem, _ = splitext(basename(query))
    gff_path = join(out_dir, "{}.gff".format(stem))
    app.update(query=query, out=gff_path, gff=gff, **kwargs)
    log.info("Running {}".format(app.command))
    proc = app()
    if exists(gff_path):
        return app
    # minced raises Java error but return 0 exit code if the input file
    # is invalid, so raise the exception for minced manually if no
    # output is created.
    proc.returncode = 1
    raise CalledProcessError(proc.returncode, cmd=repr(proc.args))
def run_hmmpress(hmm, force=False):
    '''Compress the HMM database with hmmpress.

    Parameters
    ----------
    hmm : str
        The file path to HMM database.
    force : boolean
        Whether to overwrite.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    overwrite_opt = OptionParam('-f', name='force', help='force overwrite')
    hmm_arg = ArgmntParam(name='hmm', help='hmm file to press')
    app = Dumpling('hmmpress', params=Parameters(overwrite_opt, hmm_arg))
    app.update(hmm=hmm, force=force)
    app()
    return app
def run_cmpress(cm, force=False):
    '''Compress the CM database with cmpress.

    Parameters
    ----------
    cm : str
        The file path to CM database.
    force : boolean
        Whether to overwrite.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    overwrite_opt = OptionParam('-F', name='force', help='force overwrite')
    cm_arg = ArgmntParam(name='cm', help='cm file to press')
    app = Dumpling('cmpress', params=Parameters(overwrite_opt, cm_arg))
    app.update(cm=cm, force=force)
    app()
    return app
def run_view(daa, out, fmt='sam', **kwargs):
    '''Convert Diamond daa file to a human readable output.

    Parameters
    ----------
    daa : str
        Input file resulting from diamond blast.
    out : str
        Output file.
    fmt : str
        Value for the ``fmt`` parameter of diamond view ('sam' by
        default).
    kwargs : dict
        keyword arguments. Other parameters forwarded to diamond view.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    command = ['diamond', 'view']
    app = Dumpling(command, params=Parameters(*view_params))
    app.update(daa=daa, out=out, fmt=fmt, **kwargs)
    log.info('Running {}'.format(app.command))
    app()
    return app
def run_hmmpress(hmm, force=False):
    '''Compress the HMM database with hmmpress.

    Parameters
    ----------
    hmm : str
        The file path to HMM database.
    force : boolean
        Whether to overwrite.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    overwrite_opt = OptionParam('-f', name='force', help='force overwrite')
    hmm_arg = ArgmntParam(name='hmm', help='hmm file to press')
    app = Dumpling('hmmpress', params=Parameters(overwrite_opt, hmm_arg))
    app.update(hmm=hmm, force=force)
    app()
    return app
def run(query, out_dir, cpus=1, **kwargs):
    '''Run prodigal for gene prediction.

    Notes
    -----
    It will create 3 output files with prefix of "prodigal" in "out_dir"
    folder:

    1. the annotation file (GFF3 file by default)
    2. the nucleotide sequences for each predicted gene with file
       suffix of .fna.
    3. the protein sequence translated from each gene with file suffix
       of .faa.

    An output option that the caller already switched on through
    `kwargs` is left untouched; only options that are still off get the
    default path inside `out_dir`. The standard output of the command is
    directed to ``<out_dir>/prodigal.log``.

    Parameters
    ----------
    query : str
        input file path of sequence
    out_dir : str
        output dir. It is created if it does not exist.
    cpus : int
        Prodigal does not have a param to set up how many CPU cores.
        This is a fake parameter to make it conform to the same API
        with other apps for the sake of convenience.
    kwargs : dict
        keyword arguments for Prodigal.

    Returns
    -------
    `Dumpling`
        The command object, returned after it has been invoked.
    '''
    log = getLogger(__name__)
    makedirs(out_dir, exist_ok=True)
    app = Dumpling(
        'prodigal',
        params=Parameters(*params),
        version='v2.6.3',
        url='https://github.com/hyattpd/Prodigal')
    # Defaults first (gff output, draft genome mode), then let the
    # caller's kwargs override them.
    app.update(query=query, fmt='gff', mode='single')
    app.update(**kwargs)
    # Output flag -> file suffix; the annotation suffix follows whatever
    # output format is in effect after the kwargs were applied.
    default_suffixes = (
        ('-a', 'faa'),
        # output file of nucleotide sequences of genes
        ('-d', 'fna'),
        ('-o', app.params['fmt'].value),
    )
    for flag, suffix in default_suffixes:
        option = app.params[flag]
        if not option.is_off():
            continue
        option.on(join(out_dir, 'prodigal.{}'.format(suffix)))
    log.info('Running CDS prediction {}'.format(app.command))
    app(stdout=join(out_dir, 'prodigal.log'))
    return app
def scan_file(query, db, cpu=1, **kwargs):
    '''Run cmscan on a query file against a database.

    Parameters
    ----------
    query : str
        Value for the ``query`` parameter of cmscan.
    db : str
        Value for the ``db`` parameter of cmscan.
    cpu : int
        Accepted for API compatibility; not used by this function.
    kwargs : dict
        keyword arguments. Other parameters forwarded to cmscan.

    Returns
    -------
    The result of invoking the configured `Dumpling` command.
    '''
    app = Dumpling('cmscan', params=Parameters(*_params))
    app.update(query=query, db=db, **kwargs)
    result = app()
    return result
def scan_seq(seq, db, cpu=1, **kwargs):
    '''Run cmscan on an in-memory sequence against a database.

    The sequence is written in fasta format to a named temporary file,
    which is then used as the cmscan query.

    Parameters
    ----------
    seq
        Sequence object to scan; it is handed to ``write`` with
        ``format='fasta'``.
    db : str
        Value for the ``db`` parameter of cmscan.
    cpu : int
        Accepted for API compatibility; not used by this function.
    kwargs : dict
        keyword arguments. Other parameters forwarded to cmscan.

    Returns
    -------
    The result of invoking the configured `Dumpling` command.
    '''
    app = Dumpling('cmscan', params=Parameters(*_params))
    with NamedTemporaryFile(mode='w+') as tmp:
        fasta_path = tmp.name
        write(seq, into=fasta_path, format='fasta')
        app.update(query=fasta_path, db=db, **kwargs)
        # Invoke inside the ``with`` block: the temporary file is removed
        # on exit, so the command has to run while it still exists.
        return app()