def indexSequenceBowtie2(referenceFile, threads):
    """
    Make sure a Bowtie2 index is available for a reference sequence file

    The index is assumed to be present when the file
    '<referenceFile>.1.bt2' is found. Otherwise bowtie2-build is run, using
    the reference file path itself as the index basename.

    Parameters
    ----------
    referenceFile : str
        Path to the reference sequences file
    threads : int
        Number of threads passed to bowtie2-build

    Returns
    -------
    run_successfully : bool
        True when the index was already there, otherwise the status
        reported by the command runner for bowtie2-build
    """
    if not os.path.isfile(str(referenceFile + '.1.bt2')):
        build_command = ['bowtie2-build', '--threads', str(threads),
                         referenceFile, referenceFile]
        index_built, _, _ = utils.runCommandPopenCommunicate(
            build_command, False, None, True)
        return index_built

    # Index files already on disk: nothing to run
    return True
def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True,
               numMapLoc):
    """
    Map reads against a reference using Bowtie2

    The reference is indexed first (if needed) and the alignments are
    written to 'alignment.sam' inside outdir.

    Parameters
    ----------
    fastq_files : list
        One (single-end, '-U') or two (paired-end, '-1'/'-2') fastq paths
    referenceFile : str
        Path to the reference sequences file, also used as index basename
    threads : int
        Number of threads given to Bowtie2
    outdir : str
        Path to the directory where the SAM file is written
    conserved_True : bool
        If true '--sensitive' is used, otherwise '--very-sensitive-local'
    numMapLoc : int
        Value given to the Bowtie2 '-k' option

    Returns
    -------
    run_successfully : bool
        Tells if both indexing and mapping succeeded
    sam_file : str or None
        Path to the SAM file, or None if something went wrong
    """
    sam_path = os.path.join(outdir, str('alignment.sam'))

    index_ok = indexSequenceBowtie2(referenceFile, threads)
    if not index_ok:
        return index_ok, None

    # Reads option depends on single-end versus paired-end data
    number_files = len(fastq_files)
    if number_files == 1:
        reads_option = '-U ' + fastq_files[0]
    elif number_files == 2:
        reads_option = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
    else:
        return False, None

    sensitivity = '--sensitive' if conserved_True else '--very-sensitive-local'

    mapping_command = ['bowtie2', '-k', str(numMapLoc), '-q', sensitivity,
                       '--threads', str(threads), '-x', referenceFile,
                       reads_option, '--no-unal', '-S', sam_path]
    mapping_ok, _, _ = utils.runCommandPopenCommunicate(
        mapping_command, False, None, True)

    return mapping_ok, (sam_path if mapping_ok else None)
def include_rematch_dependencies_path():
    """
    Find the ReMatCh script and hand it to utils.setPATHvariable

    The rematch.py packaged as a 'ReMatCh' resource is preferred. When the
    ReMatCh module cannot be found, the rematch.py reported by `which` is
    used instead. The program exits if neither is available.

    Returns
    -------
    resource_rematch : str
        Path to the rematch.py that was selected
    """
    which_ok, which_stdout, _ = utils.runCommandPopenCommunicate(
        ['which', 'rematch.py'], False, None, False)
    original_rematch = which_stdout.splitlines()[0] if which_ok else None

    try:
        resource_rematch = resource_filename('ReMatCh', 'rematch.py')
    except ModuleNotFoundError:
        # No ReMatCh package: fall back to whatever is in the PATH
        resource_rematch = original_rematch
    else:
        message = '\nUsing ReMatCh "{resource_rematch}" via "{original_rematch}"\n'
        print(message.format(resource_rematch=resource_rematch,
                             original_rematch=original_rematch))

    if resource_rematch is None:
        sys.exit('ReMatCh not found in the PATH')

    utils.setPATHvariable(False, resource_rematch)
    return resource_rematch
def run_bowtie_inspect(index_without_sufix, outdir):
    """
    Create the reference fasta file from Bowtie2 index files

    Parameters
    ----------
    index_without_sufix : str
        Path to the basename of the index. The basename is name of any of the
        index files but with the .X.bt2 or .rev.X.bt2 suffix omitted.
    outdir : str
        Path to output directory where the fasta file will be stored

    Returns
    -------
    run_successfully : bool
        Tells if the reference fasta file was successfully created
    reference_file : str
        Path to the newly created reference fasta file
    """
    reference_file = os.path.join(outdir,
                                  os.path.basename(index_without_sufix))

    # Every command item must be a string (the line width was previously an
    # int, which cannot be joined into / passed as a command line argument).
    # The command runs through the shell because of the '>' redirection.
    command = ['bowtie2-inspect', '--across', '80', index_without_sufix,
               '>', reference_file]
    run_successfully, _, _ = utils.runCommandPopenCommunicate(
        command, True, None, True)

    return run_successfully, reference_file
def include_rematch_dependencies_path():
    """
    Locate rematch.py in the PATH and hand it to utils.setPATHvariable

    Returns
    -------
    rematch : str or None
        First line printed by `which rematch.py`, or None when the lookup
        was not successful
    """
    # Bound up front so a failed lookup returns None instead of raising
    # UnboundLocalError on the return statement
    rematch = None

    command = ['which', 'rematch.py']
    run_successfully, stdout, _ = utils.runCommandPopenCommunicate(
        command, False, None, False)
    if run_successfully:
        rematch = stdout.splitlines()[0]
        utils.setPATHvariable(False, rematch)

    return rematch
def sortAlignment(alignment_file, output_file, sortByName_True, threads):
    """
    Sort an alignment file with samtools sort

    The output format given to samtools ('-O') is the lower-cased extension
    of output_file.

    Parameters
    ----------
    alignment_file : str
        Path to the alignment file to be sorted
    output_file : str
        Path to the sorted file to be produced
    sortByName_True : bool
        If true, the '-n' option is added to the samtools command
    threads : int
        Value given to the samtools '-@' option

    Returns
    -------
    run_successfully : bool
        Status reported by the command runner
    output_file : str or None
        Path to the sorted file, or None if the command failed
    """
    extension = os.path.splitext(output_file)[1]
    out_format = extension[1:].lower()
    name_sort_option = '-n' if sortByName_True else ''

    sort_command = ['samtools', 'sort', '-o', output_file, '-O', out_format,
                    name_sort_option, '-@', str(threads), alignment_file]
    sorted_ok, _, _ = utils.runCommandPopenCommunicate(
        sort_command, False, None, True)

    return sorted_ok, (output_file if sorted_ok else None)
def split_bam(bam_file, list_sequences, outdir, threads):
    """
    Write the alignments of some sequences into a new BAM file

    Runs `samtools view -b -u -h` on bam_file restricted to the given
    sequences and stores the result as 'partial.bam' inside outdir.

    Parameters
    ----------
    bam_file : str
        Path to the input BAM file
    list_sequences : list
        Names of the sequences (regions) to keep
    outdir : str
        Path to the directory where 'partial.bam' is written
    threads : int
        Value given to the samtools '-@' option

    Returns
    -------
    run_successfully : bool
        Status reported by the command runner
    new_bam : str
        Path to the new BAM file (returned even if the command failed)
    """
    partial_bam = os.path.join(outdir, 'partial.bam')

    view_command = ['samtools', 'view', '-b', '-u', '-h', '-o', partial_bam,
                    '-@', str(threads), bam_file]
    # All the regions go as a single space separated item
    view_command.append(' '.join(list_sequences))

    view_ok, _, _ = utils.runCommandPopenCommunicate(
        view_command, False, None, True)
    return view_ok, partial_bam
def include_rematch_dependencies_path(doNotUseProvidedSoftware):
    """
    Put the bcftools folder found next to rematch.py in front of the PATH

    Nothing is changed when rematch.py is not found with `which`, or when
    doNotUseProvidedSoftware is true.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        If true, the PATH environment variable is left untouched
    """
    found, which_stdout, _ = utils.runCommandPopenCommunicate(
        ['which', 'rematch.py'], False, None, False)
    if not found:
        return

    rematch_script = which_stdout.splitlines()[0]
    current_path = os.environ['PATH']
    rematch_folder = os.path.dirname(rematch_script)
    if doNotUseProvidedSoftware:
        return

    bcftools_bin = os.path.join(rematch_folder, 'src', 'bcftools-1.3.1', 'bin')
    os.environ['PATH'] = str(':'.join([bcftools_bin, current_path]))
def index_fasta_samtools(fasta, region_None, region_outfile_none,
                         print_comand_True):
    """
    Run samtools faidx on a fasta file

    Parameters
    ----------
    fasta : str
        Path to the fasta file
    region_None : str or None
        Region to be given to samtools faidx. Not used if None
    region_outfile_none : str or None
        Path to a file to which the command output is redirected ('>').
        When given, the command is run through the shell. Not used if None
    print_comand_True : bool
        Forwarded to the command runner (presumably controls whether the
        command is printed)

    Returns
    -------
    run_successfully : bool
        Status reported by the command runner
    stdout : str
        Standard output returned by the command runner
    """
    region = region_None if region_None is not None else ''

    if region_outfile_none is not None:
        # Redirection is only understood by the shell
        redirection = ['>', region_outfile_none]
        use_shell = True
    else:
        redirection = ['', '']
        use_shell = False

    faidx_command = ['samtools', 'faidx', fasta, region] + redirection
    faidx_ok, faidx_stdout, _ = utils.runCommandPopenCommunicate(
        faidx_command, use_shell, None, print_comand_True)
    return faidx_ok, faidx_stdout
def include_rematch_dependencies_path(doNotUseProvidedSoftware):
    """
    Prepend to PATH the bcftools binaries located alongside rematch.py

    The rematch.py script is searched with `which`. If it is found and
    doNotUseProvidedSoftware is false, the 'src/bcftools-1.3.1/bin' folder
    located in the same directory as the script is added to the beginning
    of the PATH environment variable.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        If true, PATH is not modified
    """
    lookup_ok, lookup_stdout, _ = utils.runCommandPopenCommunicate(
        ['which', 'rematch.py'], False, None, False)

    if lookup_ok:
        script_path = lookup_stdout.splitlines()[0]
        previous_path = os.environ['PATH']
        script_dir = os.path.dirname(script_path)
        if not doNotUseProvidedSoftware:
            provided_bcftools = os.path.join(script_dir, 'src',
                                             'bcftools-1.3.1', 'bin')
            new_path = ':'.join([provided_bcftools, previous_path])
            os.environ['PATH'] = str(new_path)
def run_blast_command(query_file, blast_db, db_type, blast_output, threads=1):
    """
    Run Blast: blastn or blastp

    Parameters
    ----------
    query_file : str
        Path to fasta file containing the query sequences, e.g.
        /input/queries.fasta
    blast_db : str
        Path to Blast DB files, e.g. /input/blast_db.nucl.sequences.fasta
    db_type : str
        Blast DB type. Can only be 'nucl' or 'prot'
    blast_output : str
        Path to Blast output tabular file
    threads : int
        Number of CPUs to use during Blast

    Returns
    -------
    run_successfully : bool
        Tells if Blast ran successfully

    Raises
    ------
    SystemExit
        If db_type is neither 'nucl' nor 'prot'
    """
    import sys

    # Check Blast DB type
    if db_type not in ('nucl', 'prot'):
        sys.exit('Wrong Blast DB type provided ({db_type}).'
                 ' Use one of the following: "nucl", "prot"'.format(
                     db_type=db_type))

    # Check if the output dir exists. A bare file name has an empty dirname
    # (current working directory), for which there is nothing to create.
    output_dir = os.path.dirname(blast_output)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if db_type == 'nucl':
        program = 'blastn -task blastn'
        # DUST low complexity filter only exists in blastn
        low_complexity_filter = '-dust'
    else:
        program = 'blastp -task blastp'
        # blastp does not accept -dust; SEG is the protein counterpart
        low_complexity_filter = '-seg'

    outfmt = "'7 qseqid qlen sseqid slen qstart qend sstart send evalue length pident nident mismatch gaps'"

    command = [
        program, '-query', query_file, '-db', blast_db, '-out', blast_output,
        '-outfmt', outfmt, low_complexity_filter, 'no', '-culling_limit', '1',
        '-num_threads', str(threads)
    ]

    run_successfully, _, _ = utils.runCommandPopenCommunicate(
        command, False, None, True)

    return run_successfully
def run_bowtie_build(reference_file, outdir, threads=1):
    """
    Create Bowtie2 index files

    The index basename is the name of the reference file placed inside
    outdir.

    Parameters
    ----------
    reference_file : str
        Path to the reference sequences file
    outdir : str
        Path to output directory where the index files will be stored
    threads : int, default 1
        Number of CPUs to use while creating Bowtie2 index

    Returns
    -------
    run_successfully : bool
        Status reported by the command runner for bowtie2-build
    """
    reference_name = os.path.basename(reference_file)
    index_prefix = os.path.join(outdir, reference_name)

    build_command = ['bowtie2-build', '--threads', str(threads),
                     reference_file, index_prefix]
    build_ok, _, _ = utils.runCommandPopenCommunicate(
        build_command, False, None, True)
    return build_ok
def indexAlignment(alignment_file):
    """
    Index an alignment file with samtools index

    Parameters
    ----------
    alignment_file : str
        Path to the alignment file to be indexed

    Returns
    -------
    run_successfully : bool
        Status reported by the command runner
    """
    index_ok, _, _ = utils.runCommandPopenCommunicate(
        ['samtools', 'index', alignment_file], False, None, True)
    return index_ok
def create_blast_db(db_sequences, db_output, db_type):
    """
    Creates a Blast DB

    If the output directory already holds both the Blast DB files and the
    copy of the original fasta file, nothing is run. If only one of them is
    found, a message is printed and no DB is created. Otherwise makeblastdb
    is run and the fasta file is copied to db_output.

    Parameters
    ----------
    db_sequences : str
        Path to fasta file containing the sequences from which Blast DB will
        be created, e.g. /input/sequences.fasta
    db_output : str
        Path to Blast output DB files, e.g.
        /output/blast_db.nucl.sequences.fasta. It must include a directory
        component (the directory is created if missing)
    db_type : str
        Blast DB type. Can only be 'nucl' or 'prot'

    Returns
    -------
    run_successfully : bool
        Tells if the Blast DB was successfully created (or already existed)

    Raises
    ------
    SystemExit
        If db_type is neither 'nucl' nor 'prot'
    """
    import sys
    from shutil import copyfile

    # Check Blast DB type
    if db_type not in ('nucl', 'prot'):
        sys.exit(
            'Wrong Blast DB type provided ({db_type}). Use one of the following: nucl, prot'
            .format(db_type=db_type))

    db_dir = os.path.dirname(db_output)

    # Check if db_output dir exists
    if not os.path.isdir(db_dir):
        # A directory that has just been created cannot hold a previous DB,
        # so the DB must be built (previously the function stopped here and
        # returned False without ever running makeblastdb)
        os.makedirs(db_dir)
        db_exists, original_file = False, False
    else:
        db_exists, original_file = check_db_exists(db_output)

    if db_exists and original_file:
        print('Blast DB already found at {db_output}'
              ' for {db_sequences}'.format(db_output=db_dir,
                                           db_sequences=db_sequences))
        return True

    if db_exists:
        print('The original fasta file ({db_sequences}) from which the Blast DB was produced is not present. Make'
              ' sure it is found in {db_dir} and it is'
              ' named {original_file_name}'.format(
                  db_dir=db_dir,
                  db_sequences=db_sequences,
                  original_file_name=os.path.basename(db_output)))
        return False

    if original_file:
        print('The original fasta file ({db_sequences}) from which the Blast DB was supposed to be produced is'
              ' present in Blast DB directory ({db_dir}), but no Blast DB files were found'
              ' there.'.format(db_sequences=db_sequences, db_dir=db_dir))
        return False

    run_successfully, _, _ = utils.runCommandPopenCommunicate([
        'makeblastdb', '-parse_seqids', '-dbtype', db_type, '-in',
        db_sequences, '-out', db_output
    ], False, None, True)

    if run_successfully:
        # Keep a copy of the sequences next to the DB files, named as the DB
        copyfile(db_sequences, db_output)

    return run_successfully
def main():
    """
    Produce the STX subtyping sequence files from virulencefinder_db

    Clones virulencefinder_db into '<outdir>/virulence_db/', reads
    'virulence_ecoli.fsa' and, for each group of sequences sharing the same
    first four characters of the sequence ID (for IDs starting with 'stx'),
    writes one fasta file named after the DB commit and that group.
    Sequences for which no IUPAC-free version could be produced are listed,
    one per line, in a '.problematic_sequences.tab' file.
    """
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 get_stx_db.py"')

    parser = argparse.ArgumentParser(
        prog='get_stx_db.py',
        description='Gets STX sequences from virulencefinder_db to produce a STX subtyping'
                    ' DB',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_optional_general = parser.add_argument_group(
        'General facultative options')
    parser_optional_general.add_argument(
        '-o', '--outdir', type=str, metavar='/path/to/output/directory/',
        help='Path to the directory where the sequences will be stored',
        required=False, default='.')

    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Get virulencefinder_db
    url = 'https://bitbucket.org/genomicepidemiology/virulencefinder_db.git'
    virulencefinder_db = os.path.join(args.outdir, 'virulence_db', '')
    # NOTE(review): the clone status is not checked, so an already existing
    # clone is silently reused - confirm this is intended
    _, _, _ = utils.runCommandPopenCommunicate(
        ['git', 'clone', url, virulencefinder_db], False, None, True)
    _, commit, _ = utils.runCommandPopenCommunicate([
        'git', '-C', virulencefinder_db, 'log', '--pretty=format:"%h"', '-n',
        '1'
    ], True, 15, True)

    # Get STX sequences
    stx_seq = {}
    allowed_chars = set(Seq.IUPAC.IUPACData.unambiguous_dna_letters)
    problematic_file = os.path.join(
        args.outdir,
        'virulence_db.virulence_ecoli.commit_{commit}.problematic_sequences.tab'
        .format(commit=commit))
    with open(problematic_file, 'wt', newline='\n') as writer:
        for seq in SeqIO.parse(
                os.path.join(virulencefinder_db, 'virulence_ecoli.fsa'),
                'fasta'):
            if not seq.id.lower().startswith('stx'):
                continue

            id_fields = seq.id.split(':')
            if len(id_fields) != 4:
                continue

            gene = seq.id[:4]
            # The gene entry is created even if the sequence ends up being
            # skipped below
            stx_seq.setdefault(gene, [])

            # Jani:
            # After spending what seemed to be an endless amount of hours
            # trying to solve the STEC stx subtype mystery I've come to the
            # following conclusion. For the platform we need to combine in
            # the target db stx2a, stx2c and stx2d as one subtype called
            # stx2acd. This is due to the fact that all of these subtypes
            # are the most potent ones to cause HUS and cannot be separated
            # from each other by the methods in use right now.
            if id_fields[0][:4] == 'stx2' and id_fields[3] in ['a', 'c', 'd']:
                id_fields[3] = 'acd'

            subtype = id_fields[0][:4] + id_fields[3]  # Define subtype

            seq.description = ''  # To avoid description to be print in outfile

            # For sequences with IUPAC codes, use one possible sequence based
            # on the one with the codes
            if not set(seq.seq.upper()).issubset(allowed_chars):
                all_possible_sequences = extend_ambiguous_dna(seq.seq.upper())
                if all_possible_sequences is None:
                    # One record per line (the line terminator was missing,
                    # which glued all the records together)
                    writer.write('\t'.join([
                        seq.id, 'Memory Error (too much IUPAC codes)'
                    ]) + '\n')
                    continue

                # Change the sequence
                seq = SeqRecord(
                    Seq.Seq(all_possible_sequences[0], generic_dna),
                    id='{seq_name}:IUPAC_codes_removed'.format(
                        seq_name=seq.id),
                    description='')

            seq.id = '{seq_name}:seqTyping_{subtype}'.format(seq_name=seq.id,
                                                             subtype=subtype)
            # The new ID only gets suffixes, so its first four characters
            # are still the gene
            stx_seq[seq.id[:4]].append(seq)

    # Write files
    for gene, seqs in stx_seq.items():
        gene_file = os.path.join(
            args.outdir,
            'virulence_db.virulence_ecoli.commit_{commit}.{gene}_subtyping.seq_typing.fasta'
            .format(commit=commit, gene=gene))
        with open(gene_file, 'wt', newline='\n') as writer:
            _ = SeqIO.write(seqs, writer, "fasta")

    _ = utils.runTime(start_time)