def get_kraken_version():
    """
    Find out which Kraken executable is available.

    Both 'kraken' and 'kraken2' are looked up with `which`. When both are found, kraken2 takes precedence.

    Returns
    -------
    version_kraken : int or None
        2 if kraken2 was found, 1 if only the first Kraken (or zero version) was found, or None if none of those
        was found
    """

    detected_version = None

    # Probe in ascending version order so that a kraken2 hit overrides a kraken hit
    for version, executable in enumerate(('kraken', 'kraken2'), start=1):
        found, _, _ = utils_run_command(command=['which', executable], shell_True=False, timeout_sec_None=None,
                                        print_comand_True=False)
        if found:
            detected_version = version

    return detected_version
def get_statistics_samtools(alignment, outdir):
    """
    Collect alignment statistics with Samtools stats

    The statistics are redirected to the file 'samtools_stats.txt' inside the output directory.

    Parameters
    ----------
    alignment : str
        Path to the alignment file (can be SAM, BAM or CRAM)
    outdir : str
        Path to the output directory

    Returns
    -------
    run_successfully : bool
        Boolean stating if the samtools stats command ran successfully or not
    samtools_stats : str or None
        Path to the samtools stats file if the command ran successfully, otherwise None
    """

    stats_file = os.path.join(outdir, 'samtools_stats.txt')

    # The '>' redirection is why this command is run through the shell
    succeeded, _, _ = utils_run_command(command=['samtools', 'stats', alignment, '>', stats_file], shell_True=True,
                                        timeout_sec_None=None, print_comand_True=True)
    print('')

    return succeeded, (stats_file if succeeded else None)
def run_kraken_report(kraken_db, kraken_output, outdir):
    """
    Produce the Kraken report for a previous Kraken run

    When kraken-report succeeds, its output is also saved to 'kraken_report.<DB basename>.txt' inside the output
    directory.

    Parameters
    ----------
    kraken_db : str
        Kraken DB name or path to the directory containing the Kraken DB
    kraken_output : str
        Path to Kraken output file
    outdir : str
        Path to the output directory

    Returns
    -------
    run_successfully : bool
        Boolean stating if kraken-report ran successfully or not
    kraken_results : str
        String with Kraken report (the first output captured by the command runner)
    """

    report_ok, report_text, _ = utils_run_command(command=['kraken-report', '--db', kraken_db, kraken_output],
                                                  shell_True=False, timeout_sec_None=None, print_comand_True=True)

    # Nothing to save when the report could not be produced
    if not report_ok:
        return report_ok, report_text

    report_name = 'kraken_report.{db}.txt'.format(db=os.path.basename(kraken_db))
    with open(os.path.join(outdir, report_name), 'wt') as handle:
        handle.write(report_text)

    return report_ok, report_text
def mapping_bowtie2(fastq, reference_index, outdir, threads=1):
    """
    Align paired-end reads to a Bowtie2 index

    The alignment is written to 'alignment.sam' inside the output directory.

    Parameters
    ----------
    fastq : list
        List of fastq files (only two, paired-end reads): first mates followed by second mates
    reference_index : str
        Path to the reference Bowtie2 index
    outdir : str
        Path to the output directory
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if the bowtie2 command ran successfully or not
    sam : str or None
        Path to the sam file if the command ran successfully, otherwise None
    """

    sam_file = os.path.join(outdir, 'alignment.sam')

    # Argument groups: general settings, index and reads, paired-end constraints, output
    general_args = ['bowtie2', '-q', '--very-fast', '--threads', str(threads)]
    input_args = ['-x', reference_index, '-1', fastq[0], '-2', fastq[1]]
    pairing_args = ['--fr', '-I', '0', '-X', '2000', '--no-discordant', '--no-mixed', '--no-unal']
    output_args = ['-S', sam_file]

    succeeded, _, _ = utils_run_command(command=general_args + input_args + pairing_args + output_args,
                                        shell_True=False, timeout_sec_None=None, print_comand_True=True)
    print('')

    return succeeded, (sam_file if succeeded else None)
def index_sequence_bowtie2(reference, outdir, threads=1):
    """
    Build the Bowtie2 index of a reference sequence

    The index is created inside the output directory, using the reference file name as index prefix.

    Parameters
    ----------
    reference : str
        Path to the reference fasta file against which the reads will be mapped
    outdir : str
        Path to the output directory
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if bowtie2-build ran successfully or not
    reference_index : str or None
        Path to the reference Bowtie2 index (if ran successfully, else returns None)
    """

    index_prefix = os.path.join(outdir, os.path.basename(reference))

    succeeded, _, _ = utils_run_command(command=['bowtie2-build', '--threads', str(threads), reference, index_prefix],
                                        shell_True=False, timeout_sec_None=None, print_comand_True=True)
    print('')

    return succeeded, (index_prefix if succeeded else None)
def run_kraken_main(files_to_classify, kraken_db, files_type, outdir, version_kraken, db_mem=False, quick=False,
                    min_base_quality=10, threads=1):
    """
    Classify sequence data with Kraken (kraken or kraken2)

    The command is assembled as a fixed-length list in which options that are not in use are left as empty
    strings, and all the input files are passed as a single space-joined item.
    NOTE(review): this presumably relies on utils_run_command discarding the empty items and splitting the
    joined file names - confirm against its implementation.

    Parameters
    ----------
    files_to_classify : list
        List with files to be classified by Kraken. Can be one fasta or up to two fastq files
    kraken_db : str
        Kraken DB name or path to the directory containing the Kraken DB
    files_type : str
        Type of the files to be classified: fasta or fastq
    outdir : str
        Path to the output directory
    version_kraken : int or None
        1 if only first Kraken (or zero version) was found, 2 if kraken2 was found, or None if none of those was
        found. Any value other than 2 results in the 'kraken' executable being used.
    db_mem : bool, default False
        True if want to load the Kraken DB into memory before run, else False. With kraken (version 1), True adds
        '--preload'; with kraken2, False adds '--memory-mapping'.
    quick : bool, default False
        True if want to do a quick operation and only use the first hits
    min_base_quality : int, default 10
        Minimum base quality used in classification. Only used with fastq files and kraken2.
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if Kraken ran successfully or not
    kraken_output : str or None
        Path to Kraken output file. If running kraken2, None is returned
    kraken_report : str or None
        Path to Kraken report (results) file in case of kraken2, else None

    Raises
    ------
    ValueError
        If files_type is neither 'fastq' nor 'fasta'
    SystemExit
        If no files are provided, more than two files are provided, or two fasta files are provided
    """

    files_type_options = ['fastq', 'fasta']
    if files_type not in files_type_options:
        raise ValueError("Invalid files type. Expected one of: %s" % files_type_options)

    db_label = os.path.basename(kraken_db)
    kraken_output = os.path.join(outdir, 'kraken.{db}.out'.format(db=db_label))
    kraken_report = None

    # Values of the command slots, starting with the kraken (version 1) defaults
    executable = 'kraken'
    output_target = kraken_output
    thread_count = str(threads)
    input_flag = '--{type}-input'.format(type=files_type)
    report_flag, report_path = '', ''
    quality_flag, quality_value = '', ''
    joined_files = ' '.join(files_to_classify)

    using_kraken2 = version_kraken == 2
    if using_kraken2:
        executable = 'kraken2'
        # The per-read output goes to '-' and only the report file is kept
        output_target = '-'
        # No '--<type>-input' option is passed to kraken2
        input_flag = ''
        kraken_output = None
        kraken_report = os.path.join(outdir, 'kraken_report.{db}.txt'.format(db=db_label))
        report_flag, report_path = '--report', kraken_report

        # Slots 13 and 14 of the command are reserved for '--confidence' and its value, which are currently not
        # used ("Didn't get what this confidence mean"). Kraken 2 help and manual text on the option:
        #
        #   --confidence FLOAT  Confidence score threshold (default: 0.0); must be in [0, 1].
        #
        #   At present, we have not yet developed a confidence score with a probabilistic interpretation for
        #   Kraken 2. However, we have developed a simple scoring scheme that has yielded good results for us, and
        #   we've made that available in Kraken 2 through use of the --confidence option to kraken2. The approach
        #   we use allows a user to specify a threshold score in the [0,1] interval; the classifier then will
        #   adjust labels up the tree until the label's score (described below) meets or exceeds that threshold.
        #   If a label at the root of the taxonomic tree would not have a score exceeding the threshold, the
        #   sequence is called unclassified by Kraken 2 when this threshold is applied.
        #
        #   A sequence label's score is a fraction C/Q, where C is the number of k-mers mapped to LCA values in
        #   the clade rooted at the label, and Q is the number of k-mers in the sequence that lack an ambiguous
        #   nucleotide (i.e., they were queried against the database). Consider the example of the LCA mappings
        #   in Kraken 2's output given earlier:
        #
        #   "562:13 561:4 A:31 0:1 562:3" would indicate that:
        #       the first 13 k-mers mapped to taxonomy ID #562
        #       the next 4 k-mers mapped to taxonomy ID #561
        #       the next 31 k-mers contained an ambiguous nucleotide
        #       the next k-mer was not in the database
        #       the last 3 k-mers mapped to taxonomy ID #562
        #
        #   In this case, ID #561 is the parent node of #562. Here, a label of #562 for this sequence would have
        #   a score of C/Q = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of
        #   C/Q = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a --confidence threshold over 16/21, the
        #   classifier would adjust the original label from #562 to #561; if the threshold was greater than
        #   20/21, the sequence would become unclassified.

        if files_type == 'fastq':
            quality_flag, quality_value = '--minimum-base-quality', str(min_base_quality)

    # Validate the number of input files: one fasta, or one or two fastq
    number_files = len(files_to_classify)
    if number_files == 0:
        sys.exit('No files provided for classification.')
    if number_files > 2:
        sys.exit('{n} files provided for classification. Maximum of 2 files for fastq or 1 file for fasta are'
                 ' allowed.'.format(n=number_files))

    paired_flag = ''
    if files_type == 'fastq':
        if number_files == 2:
            paired_flag = '--paired'
    elif files_type == 'fasta' and number_files == 2:
        sys.exit('{n} files provided for classification. Maximum of 1 file for fasta is'
                 ' allowed.'.format(n=number_files))

    # Only the first file is inspected to decide on the compression option
    compression_type = kraken_compression_type(files_to_classify[0])
    compression_flag = '' if compression_type is None else '--{type}-compressed'.format(type=compression_type)

    quick_flag = '--quick' if quick else ''

    memory_flag = ''
    if db_mem:
        if version_kraken == 1:
            memory_flag = '--preload'
    elif using_kraken2:
        memory_flag = '--memory-mapping'

    # The position of each item is kept fixed; options not in use stay as empty strings
    command = [
        executable, quick_flag, memory_flag,
        '--db', kraken_db,
        '--threads', thread_count,
        '--output', output_target,
        compression_flag,
        input_flag,
        report_flag, report_path,
        '', '',
        quality_flag, quality_value,
        paired_flag,
        joined_files
    ]

    run_successfully, _, _ = utils_run_command(command=command, shell_True=False, timeout_sec_None=None,
                                               print_comand_True=True)

    return run_successfully, kraken_output, kraken_report