def call_external(cmd, logger, raise_exception=False):
    '''
    Run an external command, wait for it to finish and report its outcome.

    The output of the process is logged through log_writer.log_process. If
    'raise_exception' is True and the process exits with a non-zero return
    code, a PheExternalError is raised; it is constructed with a message and
    a subprocess.CalledProcessError holding the return code, the command and
    the combined output (stdout, then an error banner, then stderr).

    The call can be wrapped inside the try_and_except function for handling
    the possible raised exceptions (should be used with
    'raise_exception'=True). Otherwise, the exception can be handled manually.

    @param cmd: Command to be run with all appropriate arguments. The validity
        of the command and the arguments is not checked.
    @type cmd: arr.

    @param logger: Logger to be used for logging the output from the process.
    @type logger: logger.

    @param raise_exception: Specifies whether an exception should be raised
        when returncode is not 0 (default False).
    @type raise_exception: bool.

    @return: Dictionary with the keys 'proc_returncode', 'proc_stdout' and
        'proc_stderr'.
    @rtype: dict.

    @raise PheExternalError: If returncode is not 0 and 'raise_exception' is
        set to True.
    '''
    import subprocess
    import log_writer

    # For now (v 2.7) can't use subprocess.call, subprocess.check_all because
    # PIPE is not correctly reading the method. For now, use Popen and wait().
    # NOTE(review): the subprocess documentation warns that wait() combined
    # with PIPE can block when the child fills a pipe buffer; kept as-is
    # because how log_process drains the pipes is not visible here.
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()

    # Log the outputs of the external program.
    # FIXME: 'log_error_to' will always go to 'error' if exit code > 0
    # (see log_process)
    process_out, process_err = log_writer.log_process(logger,
                                                      process,
                                                      log_error_to="info")

    if raise_exception and process.returncode != 0:
        # Use stdout as output and, when stderr is not empty, append it after
        # a banner. (The previous out.join([...]) used stdout as the
        # *separator*, which put the banner in front of stdout.)
        # This may pollute the error log with stdout from the process.
        out = process_out
        if process_err:
            out = (process_out or "") + "********ERROR********\n" + process_err
        # phe_exceptions is expected to be available at module level.
        raise phe_exceptions.PheExternalError(
            "External script has returned non-zero exit code.",
            subprocess.CalledProcessError(process.returncode, cmd, out))

    retval = {
        'proc_returncode': process.returncode,
        'proc_stdout': process_out,
        'proc_stderr': process_err
    }
    return retval
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
    Function
    Index a sorted BAM file and generate a pileup file from it by using the
    SAMtools mpileup command. The pileup is written to 'all.pileup' inside
    tmp_dir.
    NB: mpileup is called with the -B -A -f options (-f passes the reference
    file; the original note says -A counts anomalous reads).

    The option for method:
    tmp_dir[str]: the path to where pileup file will be created
    sorted_bam_file[str]: the path to the BAM file location
    refFn[str]: the path to the reference file location
    samtools[str]: the path to SAMtools command
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    # 1. Index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    # Binary mode: the lines come straight from the child's stdout pipe and
    # are copied through unchanged. 'with' closes the file even on error.
    with open(pileFn, 'wb') as pileupFile:
        log_writer.info_header(logger, "Generate pileup file")
        process = subprocess.Popen(
            [samtools, 'mpileup', '-B', '-A', '-f', refFn, sorted_bam_file],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        for l in process.stdout:
            pileupFile.write(l)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
def pileupReads(tmp_dir, sorted_bam_file, refFn, samtools, logger):
    """
    Function
    Index a sorted BAM file and generate a pileup file from it by using the
    SAMtools mpileup command. The pileup is written to 'all.pileup' inside
    tmp_dir.
    NB: mpileup is called with the -B -A -f options (-f passes the reference
    file; the original note says -A counts anomalous reads).

    The option for method:
    tmp_dir[str]: the path to where pileup file will be created
    sorted_bam_file[str]: the path to the BAM file location
    refFn[str]: the path to the reference file location
    samtools[str]: the path to SAMtools command
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    # 1. Index bam file
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', sorted_bam_file],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 2. Generate pileup file
    pileFn = os.path.join(tmp_dir, 'all.pileup')
    # 'with' guarantees the pileup file is closed even if reading the pipe or
    # logging raises.
    with open(pileFn, 'wb') as pileupFile:
        log_writer.info_header(logger, "Generate pileup file")
        process = subprocess.Popen(
            [samtools, 'mpileup', '-B', '-A', '-f', refFn, sorted_bam_file],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        for l in process.stdout:
            pileupFile.write(l)
        process.wait()
        log_writer.log_process(logger, process, log_error_to="info")
def call_external(cmd, logger, raise_exception=False):
    '''
    Run an external command, wait for it to finish and report its outcome.

    The output of the process is logged through log_writer.log_process. If
    'raise_exception' is True and the process exits with a non-zero return
    code, a PheExternalError is raised; it is constructed with a message and
    a subprocess.CalledProcessError holding the return code, the command and
    the combined output (stdout, then an error banner, then stderr).

    The call can be wrapped inside the try_and_except function for handling
    the possible raised exceptions (should be used with
    'raise_exception'=True). Otherwise, the exception can be handled manually.

    @param cmd: Command to be run with all appropriate arguments. The validity
        of the command and the arguments is not checked.
    @type cmd: arr.

    @param logger: Logger to be used for logging the output from the process.
    @type logger: logger.

    @param raise_exception: Specifies whether an exception should be raised
        when returncode is not 0 (default False).
    @type raise_exception: bool.

    @return: Dictionary with the keys 'proc_returncode', 'proc_stdout' and
        'proc_stderr'.
    @rtype: dict.

    @raise PheExternalError: If returncode is not 0 and 'raise_exception' is
        set to True.
    '''
    import subprocess
    import log_writer

    # For now (v 2.7) can't use subprocess.call, subprocess.check_all because
    # PIPE is not correctly reading the method. For now, use Popen and wait().
    # NOTE(review): the subprocess documentation warns that wait() combined
    # with PIPE can block when the child fills a pipe buffer; kept as-is
    # because how log_process drains the pipes is not visible here.
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()

    # Log the outputs of the external program.
    # FIXME: 'log_error_to' will always go to 'error' if exit code > 0
    # (see log_process)
    process_out, process_err = log_writer.log_process(logger,
                                                      process,
                                                      log_error_to="info")

    if raise_exception and process.returncode != 0:
        # Use stdout as output and, when stderr is not empty, append it after
        # a banner. (The previous out.join([...]) used stdout as the
        # *separator*, which put the banner in front of stdout.)
        # This may pollute the error log with stdout from the process.
        out = process_out
        if process_err:
            out = (process_out or "") + "********ERROR********\n" + process_err
        # phe_exceptions is expected to be available at module level.
        raise phe_exceptions.PheExternalError(
            "External script has returned non-zero exit code.",
            subprocess.CalledProcessError(process.returncode, cmd, out))

    retval = {
        'proc_returncode': process.returncode,
        'proc_stdout': process_out,
        'proc_stderr': process_err
    }
    return retval
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir,
            bowtie, samtools, id, logger):
    """
    Map a pair of fastq files against the provided reference with bowtie and
    produce a sorted, indexed BAM file.

    Steps: copy the reference into ``output_dir/tmp`` and index it with
    ``<bowtie>-build``, run bowtie to create a SAM file, pass the SAM file
    through ``remove_secondary_mapping_bit``, then convert to BAM, sort and
    index with samtools. The output of every external command is logged via
    ``log_writer``.

    :param input_directory: directory whose ``logs/strep_pneumo_serotyping.stderr``
                            file is handed to ``try_and_except``.
    :type input_directory: str

    :param fastqs: the two fastq files (``fastqs[0]`` and ``fastqs[1]``). They
                   will be in the following format: id.workflow.version.suffix,
                   e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
    :type fastqs: sequence of paths

    :param reference_fasta_file_path: reference fasta file, this is defined in
                                      the find_serotype function.
    :type reference_fasta_file_path: path

    :param output_dir: directory receiving the sorted BAM file; intermediate
                       files go into its ``tmp`` sub-directory.
    :type output_dir: path

    :param bowtie: path to bowtie
    :type bowtie: path

    :param samtools: path to samtools
    :type samtools: path

    :param id: sample identifier used as the prefix of the generated files.
    :type id: str

    :param logger: logger handed to ``log_writer``.

    :returns: path of the sorted BAM file (``<output_dir>/<id>-sorted.bam``).
    :rtype: str
    """
    tmp_dir = output_dir + "/tmp"
    try:
        os.makedirs(tmp_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real
        # creation error and must be reported.
        if not os.path.isdir(tmp_dir):
            raise

    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(tmp_dir, id + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, id + '.sam')
    bam = os.path.join(tmp_dir, id + '.bam')

    # copy the reference fasta file to the tmp directory and index
    reference_fasta_file = tmp_dir + "/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)

    # Parenthesised single argument: prints the same text under Python 2 and 3.
    print("running bowtie index")
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
    # generate index of the reference fasta for mapping
    process = subprocess.Popen(
        [bowtie_index, reference_fasta_file, reference_fasta_file],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)

    # run bowtie, writing the alignments to the tmp directory
    cmd = [
        bowtie, '--fr', '--minins', '300', '--maxins', '1100',
        '-x', reference_fasta_file,
        '-1', fastqs[0], '-2', fastqs[1],
        '-S', sam,
        '-k', '99999', '-D', '20', '-R', '3',
        '-N', '0', '-L', '20', '-i', 'S,1,0.50'
    ]
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # run remove_secondary_mapping_bit to deduct 256 from any bit that is
    # above 256 using the sam file. The sam_parsed file is the output that is
    # used to convert to bam.
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)

    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen(
        [samtools, 'view', '-bhS', '-o', bam, sam_parsed],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # sort bam
    # NOTE(review): this is the 'samtools sort IN OUT_PREFIX' call form;
    # presumably it yields bam_sorted + ".bam" -- confirm against the
    # installed samtools version.
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen([samtools, 'sort', bam, bam_sorted],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    return bam_out
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools,
                    ids, logger):
    """
    Function
    (1) Map each read set to each of the possible locus variants by calling
        Bowtie2 (with very sensitive options). When 'expand' is true the
        alignments go to a tmp file first, otherwise straight to the SAM file
    (2) When 'expand' is true, convert the tmp to sam file with
        remove_secondary_mapping_bit (unset the secondary alignment bit score)
    (3) Convert the sam to BAM file
    (4) sort BAM file

    The option for method:
    tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files
                  will be created
    fastq_files[list]: the path to the fastq file location (two entries)
    refFn[str]: the path to the reference file location
    expand[bool]: whether to run step (2)
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: unique identifier number
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    sorted_bam_file[str]: path of the sorted BAM file (tmp_dir/ids-all.bam)
    """
    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')

    # 1. Run bowtie. Both modes use the same command; only the log header and
    #    the file receiving the alignments differ.
    if expand:
        header, alignment_target = "Creating tmp file", tmp
    else:
        header, alignment_target = "Creating sam file", sam
    log_writer.info_header(logger, header)
    # -k = report up to 99999 good alignments per read.
    # --very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
    process = subprocess.Popen([
        bowtie, '--fr', '--no-unal', '--minins', '300', '--maxins', '1100',
        '-x', refFn, '-1', fastq_files[0], '-2', fastq_files[1],
        '-S', alignment_target,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20',
        '-i', 'S,1,0.50'
    ], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    if expand:
        # 2. remove_secondary_mapping_bit
        # The helper is given the two paths, so no file handles are opened
        # here (the previous extra open() calls were never used and opened
        # the output file for writing a second time).
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)

    # 3. Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 4. Sort bam file
    # NOTE(review): 'samtools sort IN OUT_PREFIX' call form; presumably it
    # produces out0 + ".bam", i.e. sorted_bam_file -- confirm against the
    # installed samtools version.
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return sorted_bam_file
def create_bam_file(tmp_dir, fastq_files, refFn, expand, bowtie, samtools,
                    ids, logger):
    """
    Function
    (1) Map each read set to each of the possible locus variants by calling
        Bowtie2 (with very sensitive options). When 'expand' is true the
        alignments go to a tmp file first, otherwise straight to the SAM file
    (2) When 'expand' is true, convert the tmp to sam file with
        remove_secondary_mapping_bit (unset the secondary alignment bit score)
    (3) Convert the sam to BAM file
    (4) sort BAM file

    The option for method:
    tmp_dir[str]: the path to where the tmp, SAM, BAM and sorted BAM files
                  will be created
    fastq_files[list]: the path to the fastq file location (two entries)
    refFn[str]: the path to the reference file location
    expand[bool]: whether to run step (2)
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: unique identifier number
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    sorted_bam_file[str]: path of the sorted BAM file (tmp_dir/ids-all.bam)
    """
    tmp = os.path.join(tmp_dir, ids + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, ids + '.sam')

    # 1. Run bowtie. Both modes use the same command; only the log header and
    #    the file receiving the alignments differ.
    if expand:
        header, alignment_target = "Creating tmp file", tmp
    else:
        header, alignment_target = "Creating sam file", sam
    log_writer.info_header(logger, header)
    # -k = report up to 99999 good alignments per read.
    # --very-sensitive option = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
    process = subprocess.Popen([
        bowtie, '--fr', '--no-unal', '--minins', '300', '--maxins', '1100',
        '-x', refFn, '-1', fastq_files[0], '-2', fastq_files[1],
        '-S', alignment_target,
        '-k', '99999', '-D', '20', '-R', '3', '-N', '0', '-L', '20',
        '-i', 'S,1,0.50'
    ], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    if expand:
        # 2. remove_secondary_mapping_bit
        # The helper is given the two paths, so no file handles are opened
        # here (the previous extra open() calls were never used and opened
        # the output file for writing a second time).
        log_writer.info_header(logger, "remove_secondary_mapping_bit")
        remove_secondary_mapping_bit(tmp, sam)

    # 3. Converting sam to bam file
    bam = os.path.join(tmp_dir, ids + '.unsortedbam')
    log_writer.info_header(logger, "Converting sam to bam")
    process = subprocess.Popen([samtools, 'view', '-bhS', '-o', bam, sam],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # 4. Sort bam file
    # NOTE(review): 'samtools sort IN OUT_PREFIX' call form; presumably it
    # produces out0 + ".bam", i.e. sorted_bam_file -- confirm against the
    # installed samtools version.
    out0 = os.path.join(tmp_dir, ids + '-all')
    log_writer.info_header(logger, "Sorting bam")
    sorted_bam_file = os.path.join(tmp_dir, ids + '-all.bam')
    process = subprocess.Popen([samtools, 'sort', bam, out0],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")
    return sorted_bam_file
def flanking_regions(profile_file_directory, output_directory, logger):
    """
    Function
    (1) Extract flanking regions of 100bp upstream and downstream of each
        MLST locus by blast against a reference genome. BLAST uses the first
        locus sequence as a query.
    (2) Creates summary.txt file (a tab-delimited text file display the path
        to the loci and flanking sequences)

    For every '*.fas' file in profile_file_directory (processed in sorted
    order) the first sequence is written to a bait file with seqret, blasted
    against the 'reference' database, and for each alignment whose first HSP
    covers more than half of the query the 100bp flanks are cut out of
    reference.seq with seqret and written to '<locus>_flanks.fasta'.

    The option of the method
    profile_file_directory[str]: The path to the reference.seq, profile.txt
                                 and the Locus variant sequences (*.fas)
                                 files location
    output_directory[str]: The path to where the summary.txt file and the
                           bait/flank fasta files will be created
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    reference_fasta_file = profile_file_directory + "/reference.seq"
    # The parsed record itself is not used; the call is kept so the reference
    # is still read up front exactly as before.
    SeqIO.read(reference_fasta_file, "fasta", generic_dna)

    locus_files = sorted(glob.glob(profile_file_directory + "/*.fas"))

    blast_xml = output_directory + "/my_blast_tmp.xml"
    left_flank_file = output_directory + "/tmp_left_flank.fasta"
    right_flank_file = output_directory + "/tmp_right_flank.fasta"

    with open(output_directory + "/summary.txt", "w") as summary_file_handle:
        for seq in locus_files:
            seqFileName = os.path.split(seq)[1]
            seqBaseName = os.path.splitext(seqFileName)[0]
            bait_file = output_directory + '/' + seqBaseName + "_bait.fasta"
            flanks_file = (output_directory + "/" + seqBaseName +
                           "_flanks.fasta")

            log_writer.info_header(logger, "create bait file")
            process = subprocess.Popen(
                ['seqret', seq, '-firstonly', '-auto', '-out', bait_file],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE)
            process.wait()
            log_writer.log_process(logger, process, log_error_to="info")

            cline = NcbiblastnCommandline(
                query=bait_file,
                db=profile_file_directory + "/reference",
                evalue=0.001,
                out=blast_xml,
                outfmt=5)
            cline()  # results go to blast_xml; the returned output is unused

            with open(blast_xml) as result_handle:
                blast_record = NCBIXML.read(result_handle)
            query_length = blast_record.query_letters

            for alignment in blast_record.alignments:
                hsp = alignment.hsps[0]
                if hsp.align_length / float(query_length) <= 0.5:
                    continue

                # Extend the hit to the full query length; a subject start
                # beyond the subject end means the hit is on the reverse
                # strand.
                reverse = hsp.sbjct_start > hsp.sbjct_end
                if reverse:
                    subject_start = hsp.sbjct_start + (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end - (query_length -
                                                   hsp.query_end)
                else:
                    subject_start = hsp.sbjct_start - (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end + (query_length -
                                                   hsp.query_end)

                locus_min = min(subject_start, subject_end)
                locus_max = max(subject_start, subject_end)
                left_coords = [locus_min - 100, locus_min - 1]
                right_coords = [locus_max + 1, locus_max + 100]

                # Argument lists instead of a shell string so that paths with
                # spaces or shell metacharacters are passed through intact.
                for coords, flank_file in ((left_coords, left_flank_file),
                                           (right_coords, right_flank_file)):
                    subprocess.call([
                        "seqret", reference_fasta_file,
                        "-sbegin", str(coords[0]),
                        "-send", str(coords[1]),
                        "-osformat", "fasta", "-auto",
                        "-out", flank_file
                    ])

                left_record = SeqIO.read(left_flank_file, "fasta")
                right_record = SeqIO.read(right_flank_file, "fasta")
                if reverse:
                    # On the reverse strand the flanks swap roles and are
                    # reverse-complemented.
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""

                with open(flanks_file, "w") as out_handle:
                    out_handle.write(right_record.format("fasta"))
                    out_handle.write(left_record.format("fasta"))

            # NOTE(review): written once per locus file, whether or not a
            # qualifying alignment produced the flanks file -- confirm this
            # placement against the original indentation.
            summary_file_handle.write(
                '\t'.join([seqBaseName, seq, flanks_file]) + "\n")
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
    Function
    (1) Concatenate flanking regions to correspondent locus variants sequence
        in fasta format (written to reference.fa in tmp_dir). Newly
        concatenated sequence are then indexed by Bowtie2
    (2) Then extract and store as pickled object:
        a. locus- variant names (loci.pkl)
        b. start and end position of locus variant sequences (without the
           flanking sequences) (ranges.pkl)
        c. Locus variants sequence (refSeqs.pkl)

    The option of the method
    specFn[str]: A tab-delimited text file display the path to the seven
                 flanking and loci sequences (summary.txt)
    tmp_dir[str]: The path to where reference.fa, refSeqs.pkl, ranges.pkl and
                  loci.pkl will be created
    bowtie[str]: The Bowtie2 command; '-build' is appended to it to index the
                 reference sequence
    logger: logger handed to log_writer for the stderr and stdout of the
            external command

    Return
    None (the loci names are stored in loci.pkl, not returned)
    """
    specDir = os.path.split(specFn)[0]
    with open(specFn) as spec_file:
        spc = [line.split() for line in spec_file]

    refFn = os.path.join(tmp_dir, "reference.fa")
    ranges = {}
    loci = []
    refSeqs = {}
    with open(refFn, "w") as rf:
        for (loc, variantsFn, flanksFn) in spc:
            loci.append(loc)

            # Flank sequences keyed by record id; 'up' and 'down' are used
            # below.
            fs = {}
            with open(os.path.join(specDir, flanksFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    fs[r.id] = r.seq

            with open(os.path.join(specDir, variantsFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
                    s += fs['up']
                    s += r.seq
                    s += fs['down']
                    Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)],
                                    rf, "fasta")
                    # Position of the variant inside the concatenated
                    # sequence, i.e. without the flanks.
                    ranges[r.id] = (len(fs['up']),
                                    len(fs['up']) + len(r.seq))
                    refSeqs[r.id] = s

    # ranges.pkl: start and end position of locus variant sequences (without
    #             the flanking sequences)
    # loci.pkl: locus names
    # refSeqs.pkl: Locus variants sequence
    # Binary mode keeps the pickle bytes untouched on every platform.
    for pkl_name, payload in (("ranges.pkl", ranges),
                              ("loci.pkl", loci),
                              ("refSeqs.pkl", refSeqs)):
        with open(os.path.join(tmp_dir, pkl_name), 'wb') as f:
            pickle.dump(payload, f)

    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    # generate index of reference fasta for mapping
    process = subprocess.Popen([bowtie2_index, refFn, refFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # Equivalent of the former "rm -f summary.txt": best-effort removal of
    # summary.txt in the current working directory.
    # NOTE(review): this targets the working directory, not specFn -- confirm
    # that is intended.
    try:
        os.remove("summary.txt")
    except OSError:
        pass
def mapping(input_directory, fastqs, reference_fasta_file_path, output_dir,
            bowtie, samtools, id, logger, threads):
    """
    Map a pair of fastq files against the provided reference with bowtie and
    produce a sorted, indexed BAM file.

    Steps: copy the reference into ``output_dir/tmp`` and index it with
    ``<bowtie>-build``, run bowtie to create a SAM file, pass the SAM file
    through ``remove_secondary_mapping_bit``, then convert to BAM, sort and
    index with samtools. The output of every external command is logged via
    ``log_writer``.

    :param input_directory: directory whose ``logs/strep_pneumo_serotyping.stderr``
                            file is handed to ``try_and_except``.
    :type input_directory: str

    :param fastqs: the two fastq files (``fastqs[0]`` and ``fastqs[1]``). They
                   will be in the following format: id.workflow.version.suffix,
                   e.g. 1.strep_pneumo.1_1.1.trimmed.fastq
    :type fastqs: sequence of paths

    :param reference_fasta_file_path: reference fasta file, this is defined in
                                      the find_serotype function.
    :type reference_fasta_file_path: path

    :param output_dir: directory receiving the sorted BAM file; intermediate
                       files go into its ``tmp`` sub-directory.
    :type output_dir: path

    :param bowtie: path to bowtie
    :type bowtie: path

    :param samtools: path to samtools
    :type samtools: path

    :param id: sample identifier used as the prefix of the generated files.
    :type id: str

    :param logger: logger handed to ``log_writer``.

    :param threads: thread count, passed as ``-p`` to bowtie and as ``-@`` to
                    ``samtools view`` and ``samtools sort``.

    :returns: path of the sorted BAM file (``<output_dir>/<id>-sorted.bam``).
    :rtype: str
    """
    tmp_dir = output_dir + "/tmp"
    try:
        os.makedirs(tmp_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real
        # creation error and must be reported.
        if not os.path.isdir(tmp_dir):
            raise

    bam_sorted = os.path.join(output_dir, id + '-sorted')
    bam_out = os.path.join(output_dir, id + '-sorted.bam')
    sam_parsed = os.path.join(tmp_dir, id + '.tmp')  # temporary sam output
    sam = os.path.join(tmp_dir, id + '.sam')
    bam = os.path.join(tmp_dir, id + '.bam')

    # copy the reference fasta file to the tmp directory and index
    reference_fasta_file = tmp_dir + "/reference.fasta"
    shutil.copyfile(reference_fasta_file_path, reference_fasta_file)

    # Parenthesised single argument: prints the same text under Python 2 and 3.
    print("running bowtie index")
    bowtie_index = bowtie + "-build"
    log_writer.info_header(logger, "Creating reference_fasta_fileerence index")
    # generate index of the reference fasta for mapping
    process = subprocess.Popen(
        [bowtie_index, reference_fasta_file, reference_fasta_file],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process)

    # run bowtie, writing the alignments to the tmp directory
    cmd = [
        bowtie, '--fr', '--minins', '300', '--maxins', '1100',
        '-x', reference_fasta_file,
        '-1', fastqs[0], '-2', fastqs[1],
        '-S', sam,
        '-k', '99999', '-D', '20', '-R', '3',
        '-N', '0', '-L', '20', '-i', 'S,1,0.50',
        '-p', str(threads)
    ]
    log_writer.info_header(logger, "Running bowtie to generate sam file")
    process = subprocess.Popen(cmd,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # run remove_secondary_mapping_bit to deduct 256 from any bit that is
    # above 256 using the sam file. The sam_parsed file is the output that is
    # used to convert to bam.
    try_and_except(input_directory + "/logs/strep_pneumo_serotyping.stderr",
                   remove_secondary_mapping_bit, sam, sam_parsed)

    log_writer.info_header(logger, "Convert sam to bam")
    process = subprocess.Popen(
        [samtools, 'view', '-bhS', '-o', bam, '-@', str(threads), sam_parsed],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # sort bam
    log_writer.info_header(logger, "Sort the bam file")
    process = subprocess.Popen(
        [samtools, 'sort', '-o', bam_sorted + ".bam", '-@', str(threads), bam],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam
    log_writer.info_header(logger, "Index the BAM file")
    process = subprocess.Popen([samtools, 'index', bam_sorted + ".bam"],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    return bam_out
def getNovelAllele(variant, locus, fastq_files, bowtie, samtools, ids,
                   tmp_dir, logger):
    """
    Function
    Generate SAM, BAM and pileup file for novel allele.

    The allele sequence is looked up in refSeqs.pkl (inside tmp_dir) under the
    name 'locus-variant', written to a fasta file and indexed with Bowtie2.
    The reads are mapped with create_bam_file, the resulting BAM is renamed
    to 'ids.locus-variant.bam' and indexed, and a pileup is written to
    'ids.locus-variant.pileup'.

    The option for method:
    variant[str]: locus variant number
    locus[str]: locus name
    fastq_files[str]: the path to the fastq file
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: sample unique identifier number
    tmp_dir[str]: the path to where SAM, BAM, Pileup will be created
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    # NOTE: pickle must only be used on trusted files; refSeqs.pkl is expected
    # to be the file produced earlier by this pipeline in tmp_dir.
    with open(os.path.join(tmp_dir, "refSeqs.pkl"), 'rb') as pkl_file:
        refSeqs = pickle.load(pkl_file)

    allele_name = locus + "-" + variant
    typeFn = os.path.join(tmp_dir, allele_name + ".fa")
    s = refSeqs[allele_name]
    with open(typeFn, "w") as typeFile:
        Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)],
                        typeFile, "fasta")

    # index reference sample
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "index refrence sample")
    process = subprocess.Popen([bowtie2_index, typeFn, typeFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # create sam and bam files
    log_writer.info_header(logger, "Creating sam and bam files")
    bam = create_bam_file(tmp_dir, fastq_files, typeFn, False, bowtie,
                          samtools, ids, logger)

    # name bam file
    bamFn = os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
    process = subprocess.Popen(['mv', bam, bamFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam file (header is logged before the step it announces)
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', bamFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # generate pileup file
    piFn = os.path.join(tmp_dir, ids + "." + allele_name + '.pileup')
    # Binary mode: the lines come straight from the child's stdout pipe and
    # are copied through unchanged. 'with' closes the file even on error.
    with open(piFn, 'wb') as f:
        log_writer.info_header(logger, "generate pileup file")
        process = subprocess.Popen(
            [samtools, 'mpileup', '-B', '-A', '-cf', typeFn, bamFn],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        for l in process.stdout:
            f.write(l)
    process.wait()
    # Log what mpileup reported, same sequence as used in pileupReads.
    log_writer.log_process(logger, process, log_error_to="info")
def getNovelAllele(variant, locus, fastq_files, bowtie, samtools, ids,
                   tmp_dir, logger):
    """
    Function
    Generate SAM, BAM and pileup file for novel allele.

    The allele sequence is looked up in refSeqs.pkl (inside tmp_dir) under the
    name 'locus-variant', written to a fasta file and indexed with Bowtie2.
    The reads are mapped with create_bam_file, the resulting BAM is renamed
    to 'ids.locus-variant.bam' and indexed, and a pileup is written to
    'ids.locus-variant.pileup'.

    The option for method:
    variant[str]: locus variant number
    locus[str]: locus name
    fastq_files[str]: the path to the fastq file
    bowtie[str]: the path to Bowtie2 command
    samtools[str]: the path to SAMtools command
    ids[str]: sample unique identifier number
    tmp_dir[str]: the path to where SAM, BAM, Pileup will be created
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    # NOTE: pickle must only be used on trusted files; refSeqs.pkl is expected
    # to be the file produced earlier by this pipeline in tmp_dir.
    with open(os.path.join(tmp_dir, "refSeqs.pkl"), 'rb') as pkl_file:
        refSeqs = pickle.load(pkl_file)

    allele_name = locus + "-" + variant
    typeFn = os.path.join(tmp_dir, allele_name + ".fa")
    s = refSeqs[allele_name]
    with open(typeFn, "w") as typeFile:
        Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=allele_name)],
                        typeFile, "fasta")

    # index reference sample
    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "index refrence sample")
    process = subprocess.Popen([bowtie2_index, typeFn, typeFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # create sam and bam files
    log_writer.info_header(logger, "Creating sam and bam files")
    bam = create_bam_file(tmp_dir, fastq_files, typeFn, False, bowtie,
                          samtools, ids, logger)

    # name bam file
    bamFn = os.path.join(tmp_dir, ids + "." + allele_name + ".bam")
    process = subprocess.Popen(['mv', bam, bamFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # index bam file (header is logged before the step it announces)
    log_writer.info_header(logger, "index bam file")
    process = subprocess.Popen([samtools, 'index', bamFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # generate pileup file
    piFn = os.path.join(tmp_dir, ids + "." + allele_name + '.pileup')
    # 'with' guarantees the pileup file is closed even if reading the pipe
    # raises.
    with open(piFn, 'wb') as f:
        log_writer.info_header(logger, "generate pileup file")
        process = subprocess.Popen(
            [samtools, 'mpileup', '-B', '-A', '-cf', typeFn, bamFn],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        for l in process.stdout:
            f.write(l)
    process.wait()
    # Log what mpileup reported, same sequence as used in pileupReads.
    log_writer.log_process(logger, process, log_error_to="info")
def flanking_regions(profile_file_directory, output_directory, logger):
    """
    Function
    (1) Extract flanking regions of 100bp upstream and downstream of each
        MLST locus by blast against a reference genome. BLAST uses the first
        locus sequence as a query.
    (2) Creates summary.txt file (a tab-delimited text file display the path
        to the loci and flanking sequences)

    For every '*.fas' file in profile_file_directory (processed in sorted
    order) the first sequence is written to a bait file with seqret, blasted
    against the 'reference' database, and for each alignment whose first HSP
    covers more than half of the query the 100bp flanks are cut out of
    reference.seq with seqret and written to '<locus>_flanks.fasta'.

    The option of the method
    profile_file_directory[str]: The path to the reference.seq, profile.txt
                                 and the Locus variant sequences (*.fas)
                                 files location
    output_directory[str]: The path to where the summary.txt file and the
                           bait/flank fasta files will be created
    logger: logger handed to log_writer for the stderr and stdout of the
            external commands

    Return
    None
    """
    reference_fasta_file = profile_file_directory + "/reference.seq"
    # The parsed record itself is not used; the call is kept so the reference
    # is still read up front exactly as before.
    SeqIO.read(reference_fasta_file, "fasta", generic_dna)

    locus_files = sorted(glob.glob(profile_file_directory + "/*.fas"))

    blast_xml = output_directory + "/my_blast_tmp.xml"
    left_flank_file = output_directory + "/tmp_left_flank.fasta"
    right_flank_file = output_directory + "/tmp_right_flank.fasta"

    with open(output_directory + "/summary.txt", "w") as summary_file_handle:
        for seq in locus_files:
            seqFileName = os.path.split(seq)[1]
            seqBaseName = os.path.splitext(seqFileName)[0]
            bait_file = output_directory + '/' + seqBaseName + "_bait.fasta"
            flanks_file = (output_directory + "/" + seqBaseName +
                           "_flanks.fasta")

            log_writer.info_header(logger, "create bait file")
            process = subprocess.Popen(
                ['seqret', seq, '-firstonly', '-auto', '-out', bait_file],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE)
            process.wait()
            log_writer.log_process(logger, process, log_error_to="info")

            cline = NcbiblastnCommandline(
                query=bait_file,
                db=profile_file_directory + "/reference",
                evalue=0.001,
                out=blast_xml,
                outfmt=5)
            cline()  # results go to blast_xml; the returned output is unused

            with open(blast_xml) as result_handle:
                blast_record = NCBIXML.read(result_handle)
            query_length = blast_record.query_letters

            for alignment in blast_record.alignments:
                hsp = alignment.hsps[0]
                if hsp.align_length / float(query_length) <= 0.5:
                    continue

                # Extend the hit to the full query length; a subject start
                # beyond the subject end means the hit is on the reverse
                # strand.
                reverse = hsp.sbjct_start > hsp.sbjct_end
                if reverse:
                    subject_start = hsp.sbjct_start + (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end - (query_length -
                                                   hsp.query_end)
                else:
                    subject_start = hsp.sbjct_start - (hsp.query_start - 1)
                    subject_end = hsp.sbjct_end + (query_length -
                                                   hsp.query_end)

                locus_min = min(subject_start, subject_end)
                locus_max = max(subject_start, subject_end)
                left_coords = [locus_min - 100, locus_min - 1]
                right_coords = [locus_max + 1, locus_max + 100]

                # Argument lists instead of a shell string so that paths with
                # spaces or shell metacharacters are passed through intact.
                for coords, flank_file in ((left_coords, left_flank_file),
                                           (right_coords, right_flank_file)):
                    subprocess.call([
                        "seqret", reference_fasta_file,
                        "-sbegin", str(coords[0]),
                        "-send", str(coords[1]),
                        "-osformat", "fasta", "-auto",
                        "-out", flank_file
                    ])

                left_record = SeqIO.read(left_flank_file, "fasta")
                right_record = SeqIO.read(right_flank_file, "fasta")
                if reverse:
                    # On the reverse strand the flanks swap roles and are
                    # reverse-complemented.
                    left_record.id = "down"
                    left_record.seq = left_record.seq.reverse_complement()
                    right_record.id = "up"
                    right_record.seq = right_record.seq.reverse_complement()
                else:
                    left_record.id = "up"
                    right_record.id = "down"
                right_record.description = ""
                left_record.description = ""

                with open(flanks_file, "w") as out_handle:
                    out_handle.write(right_record.format("fasta"))
                    out_handle.write(left_record.format("fasta"))

            # NOTE(review): written once per locus file, whether or not a
            # qualifying alignment produced the flanks file -- confirm this
            # placement against the original indentation.
            summary_file_handle.write(
                '\t'.join([seqBaseName, seq, flanks_file]) + "\n")
def concatenate_flanking_regions(specFn, tmp_dir, bowtie, logger):
    """
    Function
    (1) Concatenate flanking regions to correspondent locus variants sequence
        in fasta format (written to reference.fa in tmp_dir). Newly
        concatenated sequence are then indexed by Bowtie2
    (2) Then extract and store as pickled object:
        a. locus- variant names (loci.pkl)
        b. start and end position of locus variant sequences (without the
           flanking sequences) (ranges.pkl)
        c. Locus variants sequence (refSeqs.pkl)

    The option of the method
    specFn[str]: A tab-delimited text file display the path to the seven
                 flanking and loci sequences (summary.txt)
    tmp_dir[str]: The path to where reference.fa, refSeqs.pkl, ranges.pkl and
                  loci.pkl will be created
    bowtie[str]: The Bowtie2 command; '-build' is appended to it to index the
                 reference sequence
    logger: logger handed to log_writer for the stderr and stdout of the
            external command

    Return
    None (the loci names are stored in loci.pkl, not returned)
    """
    specDir = os.path.split(specFn)[0]
    with open(specFn) as spec_file:
        spc = [line.split() for line in spec_file]

    refFn = os.path.join(tmp_dir, "reference.fa")
    ranges = {}
    loci = []
    refSeqs = {}
    with open(refFn, "w") as rf:
        for (loc, variantsFn, flanksFn) in spc:
            loci.append(loc)

            # Flank sequences keyed by record id; 'up' and 'down' are used
            # below.
            fs = {}
            with open(os.path.join(specDir, flanksFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    fs[r.id] = r.seq

            with open(os.path.join(specDir, variantsFn)) as f:
                for r in Bio.SeqIO.parse(f, "fasta"):
                    s = Bio.Seq.MutableSeq('', Bio.Alphabet.generic_dna)
                    s += fs['up']
                    s += r.seq
                    s += fs['down']
                    Bio.SeqIO.write([Bio.SeqRecord.SeqRecord(s, id=r.id)],
                                    rf, "fasta")
                    # Position of the variant inside the concatenated
                    # sequence, i.e. without the flanks.
                    ranges[r.id] = (len(fs['up']),
                                    len(fs['up']) + len(r.seq))
                    refSeqs[r.id] = s

    # ranges.pkl: start and end position of locus variant sequences (without
    #             the flanking sequences)
    # loci.pkl: locus names
    # refSeqs.pkl: Locus variants sequence
    # Binary mode keeps the pickle bytes untouched on every platform.
    for pkl_name, payload in (("ranges.pkl", ranges),
                              ("loci.pkl", loci),
                              ("refSeqs.pkl", refSeqs)):
        with open(os.path.join(tmp_dir, pkl_name), 'wb') as f:
            pickle.dump(payload, f)

    bowtie2_index = bowtie + "-build"
    log_writer.info_header(logger, "bowtie_indexed")
    # generate index of reference fasta for mapping
    process = subprocess.Popen([bowtie2_index, refFn, refFn],
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    process.wait()
    log_writer.log_process(logger, process, log_error_to="info")

    # Equivalent of the former "rm -f summary.txt": best-effort removal of
    # summary.txt in the current working directory.
    # NOTE(review): this targets the working directory, not specFn -- confirm
    # that is intended.
    try:
        os.remove("summary.txt")
    except OSError:
        pass