def main():
    '''Merge the bams of a directory into one sorted and indexed bam.

    The work directory, the output path and the reference are taken from
    set_parameters(). Intermediate files are written to a temporary
    directory that is removed even when one of the steps fails.
    '''
    # set parameters
    work_dir, output, reference = set_parameters()

    # make a working temp dir
    temp_dir = NamedTemporaryDir()
    try:
        # add readgroup tag to each alignment in bam
        add_header_and_tags_bams(work_dir, temp_dir.name)

        # prepare files to merge
        sams = get_opened_sams_from_dir(temp_dir.name)
        temp_sam = NamedTemporaryFile()

        # merge all the sams in one
        merge_sam(sams, temp_sam, reference)

        # convert the merged sam into a (temporary) bam
        temp_bam = NamedTemporaryFile(suffix='.bam')
        sam2bam(temp_sam.name, temp_bam.name)

        # finally we need to sort the bam
        sort_bam_sam(temp_bam.name, output)

        # and make an index of the sorted bam
        call(['samtools', 'index', output], raise_on_error=True)
    finally:
        # the temporary directory must not outlive a failed run
        temp_dir.close()
def main():
    '''Write the samtools pileup of one or several bam files.

    The bam files, the opened pileup output and the reference file come
    from set_parameters(). When several bams are given they are first merged
    into a temporary bam.

    Raises RuntimeError if samtools merge or samtools pileup fail.
    '''
    # set parameters
    bamfiles, pileup, reffile = set_parameters()

    # temporary dir that holds the merged bam
    temp_dir = NamedTemporaryDir()
    try:
        # merge bam files
        if len(bamfiles) == 1:
            merged_bam_file = bamfiles[0]
        else:
            merged_bam_file = os.path.join(temp_dir.name, 'merged.bam')
            # list.extend() returns None, so the command has to be built by
            # concatenation and not from the result of extend()
            cmd = ['samtools', 'merge', merged_bam_file] + list(bamfiles)
            stdout, stderr, retcode = call(cmd)
            if retcode:
                raise RuntimeError('samtools merge - step error: %s' % stderr)

        # multiple alignment
        cmd = ['samtools', 'pileup', '-f', reffile, merged_bam_file]
        stdout, stderr, retcode = call(cmd)
        if retcode:
            msg = 'Error:\nstep:%s\n error: %s' % (" ".join(cmd), stderr)
            raise RuntimeError(msg)
        pileup.write(stdout)
        pileup.close()
    finally:
        # remove the merged bam even if samtools failed
        temp_dir.close()
def create_sam_reference_index(reference_fpath):
    '''It runs samtools faidx on the reference unless the index exists.

    The index is looked for in reference_fpath + '.fai'. If that file is
    already there nothing is done.
    '''
    if not os.path.exists(reference_fpath + '.fai'):
        call(['samtools', 'faidx', reference_fpath], raise_on_error=True)
def map_reads_with_gmap(reference_fpath, reads_fpath, out_bam_fpath,
                        parameters):
    '''It maps the reads against the reference with gmap and writes a bam.

    parameters is a dict. 'threads' is required; 'tmp_dir', 'unmapped_fhand'
    and whatever create_gmap_reference reads from it are optional.
    The gmap database is built when its .chromosome file is not found.
    '''
    threads = parameters['threads']
    tmp_dir = parameters.get('tmp_dir')

    reference_dir, reference_file_name = os.path.split(reference_fpath)
    reference_name = reference_file_name.split('.')[0]
    reference_dir = reference_dir or '.'

    # the gmap database is expected in reference_dir/reference_name
    chromosome_fpath = os.path.join(reference_dir, reference_name,
                                    reference_name + '.chromosome')
    if not os.path.exists(chromosome_fpath):
        create_gmap_reference(reference_dir, reference_fpath, reference_name,
                              parameters)

    # original note on --canonical-mode=0: this gmap option does not detect
    # deletions close to introns
    cmd = ['gmap', '-d', reference_name, '-D', reference_dir, '-f', 'samse',
           '--canonical-mode=0']
    if threads:
        cmd += ['-t', str(threads)]
    cmd.append(reads_fpath)

    sam_fhand = NamedTemporaryFile(suffix='.sam', dir=tmp_dir)
    call(cmd, stdout=sam_fhand, raise_on_error=True)

    unmapped_fhand = parameters.get('unmapped_fhand')
    if unmapped_fhand is not None:
        # the sam handed to sam2bam is the one written by get_out_unmapped
        filtered_fhand = NamedTemporaryFile(dir=tmp_dir, suffix='.sam')
        get_out_unmapped(sam_fhand, unmapped_fhand, filtered_fhand)
        sam_fhand = filtered_fhand

    sam2bam(sam_fhand.name, out_bam_fpath)
    sam_fhand.close()
def bamsam_converter(input_fhand, output_fhand, java_conf=None):
    '''It converts between sam and bam with picard SamFormatConverter.

    Despite their names input_fhand and output_fhand are used as paths:
    they are concatenated to the INPUT= and OUTPUT= picard arguments.
    The picard directory is the one where SortSam.jar is found.
    '''
    jar_dir = guess_jar_dir('SortSam.jar', java_conf)
    converter_jar = os.path.join(jar_dir, 'SamFormatConverter.jar')

    picard_args = ['-jar', converter_jar,
                   'INPUT=' + input_fhand,
                   'OUTPUT=' + output_fhand]
    cmd = java_cmd(java_conf)
    cmd.extend(picard_args)
    call(cmd, raise_on_error=True, add_ext_dir=False)
def create_bam_index(bam_fpath):
    '''It runs samtools index on the bam unless bam_fpath + '.bai' exists.'''
    already_indexed = os.path.exists(bam_fpath + '.bai')
    if not already_indexed:
        call(['samtools', 'index', bam_fpath], raise_on_error=True)
def bam2sam(bam_path, sam_path, header=False):
    '''It writes the bam as a sam file using samtools view.

    bam_path -- the bam to read.
    sam_path -- the sam to write, given to samtools with -o.
    header   -- when true, -h is added to the samtools command.
    '''
    header_flag = ['-h'] if header else []
    cmd = ['samtools', 'view'] + header_flag + [bam_path, '-o', sam_path]
    call(cmd, raise_on_error=True)
def main():
    '''Map the reads with bwa and write a sorted bam.

    The parameters come from set_parameters(). seq_type selects the
    aligner: 'short' runs bwa aln + bwa samse, 'long' runs bwa dbwtsw.
    The alignment is converted to bam and sorted with samtools.

    Raises OSError if the bwa index (refdb + '.bwt') does not exist,
    ValueError if seq_type is neither 'short' nor 'long' and RuntimeError
    if any of the external commands fails.
    '''
    seqfile, reffile, seq_type, refdb, bamfile = set_parameters()

    # we need that the index is created
    if not os.path.exists(refdb + '.bwt'):
        raise OSError('Reference database index not created: %s' % refdb)

    def run_step(cmd):
        'It runs the command and returns its stdout, it raises if it fails'
        stdout, stderr, retcode = call(cmd)
        if retcode:
            # a one argument print() behaves the same in python 2 and 3
            print(stderr)
            raise RuntimeError(" ".join(cmd))
        return stdout

    temp_dir = NamedTemporaryDir()
    try:
        dir_name = temp_dir.name
        ali_fpath = os.path.join(dir_name, 'output.ali')
        unsorted_bam_fpath = os.path.join(dir_name, 'bam_file.bam')

        if seq_type == 'short':
            sai_fpath = os.path.join(dir_name, 'output.sai')
            with open(sai_fpath, 'wb') as sai_fhand:
                sai_fhand.write(run_step(['bwa', 'aln', refdb, seqfile]))
            alignment = run_step(['bwa', 'samse', refdb, sai_fpath, seqfile])
        elif seq_type == 'long':
            # align sanger
            alignment = run_step(['bwa', 'dbwtsw', refdb, seqfile])
        else:
            raise ValueError('Seq type: short or long')

        with open(ali_fpath, 'w') as ali_fhand:
            ali_fhand.write(alignment)

        # from sam to bam
        run_step(['samtools', 'view', '-bt', reffile, '-o',
                  unsorted_bam_fpath, ali_fpath])

        # sort the bam
        run_step(['samtools', 'sort', unsorted_bam_fpath, bamfile])
    finally:
        # the intermediate files are not needed once the bam is sorted
        temp_dir.close()
def blast_runner_plus(seq_fpath, blast_db, blast_type, result_fpath,
                      threads=False):
    '''It runs a blast+ program with a query file against a database.

    blast_type is the executable to run. The result is written to
    result_fpath with -outfmt 5. When threads is true its value is passed
    with -num_threads.
    '''
    options = (('-db', blast_db),
               ('-num_alignments', '25'),
               ('-num_descriptions', '25'),
               ('-evalue', '0.0001'),
               ('-outfmt', '5'),
               ('-query', seq_fpath),
               ('-out', result_fpath))
    cmd = [blast_type]
    for flag, value in options:
        cmd.extend([flag, value])
    if threads:
        cmd += ['-num_threads', str(threads)]
    call(cmd, raise_on_error=True, log=True)
def create_bowtie_reference(reference_fpath, color=False):
    '''It builds the bowtie index with bowtie-build.

    The index name is the reference path without its extension. When color
    is true the -C flag is added.
    '''
    index_prefix = os.path.splitext(reference_fpath)[0]
    color_flag = ['-C'] if color else []
    cmd = ['bowtie-build'] + color_flag + [reference_fpath, index_prefix]
    call(cmd, raise_on_error=True)
def compress_and_index_vcf(vcf_fpath):
    '''It compresses the vcf with bgzip and indexes it with tabix.

    bgzip is run with -f on vcf_fpath and tabix is then run on
    vcf_fpath + '.gz', the compressed file.
    '''
    gz_fpath = '{0:s}.gz'.format(vcf_fpath)
    commands = (['bgzip', '-f', vcf_fpath],
                ['tabix', '-p', 'vcf', '-f', gz_fpath])
    for cmd in commands:
        call(cmd, raise_on_error=True)
def create_picard_dict(reference_fpath, java_conf=None):
    '''It creates a picard sequence dictionary if it does not exist.

    The dictionary path is the reference path with its extension replaced
    by '.dict'. The picard directory is the one where SortSam.jar is found.

    java_conf -- given to guess_jar_dir and to java_cmd, so the JVM is
                 launched the same way as in the other picard wrappers.
    '''
    dict_path = os.path.splitext(reference_fpath)[0] + '.dict'
    if os.path.exists(dict_path):
        return

    picard_path = guess_jar_dir('SortSam.jar', java_conf)
    picard_jar = os.path.join(picard_path, 'CreateSequenceDictionary.jar')

    # NOTE(review): java_cmd(java_conf) replaces a hard-coded 'java'; it is
    # assumed to return the java command list, as the sibling wrappers use it
    cmd = java_cmd(java_conf)
    cmd.extend(['-jar', picard_jar, 'R=%s' % reference_fpath,
                'O=%s' % dict_path])
    call(cmd, raise_on_error=True, add_ext_dir=False)
def create_gmap_reference(reference_dir, reference_path, reference_name,
                          parameters=None):
    '''It builds the gmap database of a reference with gmap_build.

    reference_dir  -- given to gmap_build with -D.
    reference_path -- the reference sequence file.
    reference_name -- given to gmap_build with -d.
    parameters     -- optional dict, only its 'kmer' key is used (-k).

    The reference_name + '.coords' file is removed from the working
    directory if it is found there after the build.
    '''
    has_kmer = parameters and 'kmer' in parameters
    kmer_opts = ['-k', str(parameters['kmer'])] if has_kmer else []

    cmd = ['gmap_build', '-B', get_external_bin_dir(), '-D', reference_dir,
           '-d', reference_name]
    cmd += kmer_opts
    cmd.append(reference_path)
    call(cmd,raise_on_error=True)

    coords_fpath = '%s.coords' % reference_name
    if os.path.exists(coords_fpath):
        os.remove(coords_fpath)
def run(self):
    '''It runs samtools calmd on the last bam version and indexes the result.

    The output of samtools calmd -Abr is written to the next version of
    the bam. The index is created once the output file has been closed.
    '''
    self._log({'analysis_started':True})
    inputs = self._get_input_fpaths()
    bam_path = inputs['bam']
    bam_fpath = bam_path.last_version
    reference_fpath = inputs['reference'].last_version
    out_fpath = bam_path.next_version

    cmd = ['samtools', 'calmd', '-Abr', bam_fpath, reference_fpath]
    # the bam is binary; the with block closes the handle even if samtools
    # fails, and guarantees it is closed before the file is indexed
    with open(out_fpath, 'wb') as out_fhand:
        call(cmd, raise_on_error=True, stdout=out_fhand)

    create_bam_index(out_fpath)
    self._log({'analysis_finished':True})
def main():
    '''Build a codon usage table with cusp from the orfs of the input.

    The input handle and the output path come from set_parameters(). The
    orfs returned by get_orfs are written as fasta to a temporary file
    that is given to cusp.

    Raises whatever call() raises with raise_on_error=True if cusp fails.
    '''
    # set parameters
    infhand, outfpath = set_parameters()

    # get orfs
    orf_fhand = NamedTemporaryFile()
    try:
        for index, orf in enumerate(get_orfs(infhand)):
            orf_fhand.write('>name_%d\n%s\n' % (index, orf))
        # cusp reads the file by name, so the data must be on disk
        orf_fhand.flush()

        # run codon_table_maker; a debugging print and a raw_input() pause
        # used to sit here and made the script wait for the keyboard
        cmd = ['cusp', '-sequence', orf_fhand.name, '-outfile', outfpath]
        call(cmd, raise_on_error=True)
    finally:
        orf_fhand.close()
def create_bwa_reference(reference_fpath, color=False):
    '''It creates the bwa index for the given reference.

    The indexing algorithm depends on the number of sequences, counted as
    the lines that start with '>': 'bwtsw' for more than 10000 sequences
    and 'is' otherwise. When color is true the -c flag is added.
    '''
    # how many sequences do we have? The with block closes the reference,
    # a bare open() inside the for statement left the handle opened
    with open(reference_fpath) as reference_fhand:
        n_seqs = sum(1 for line in reference_fhand if line.startswith('>'))

    algorithm = 'bwtsw' if n_seqs > 10000 else 'is'

    cmd = ['bwa', 'index', '-a', algorithm, reference_fpath]
    if color:
        cmd.append('-c')
    call(cmd, raise_on_error=True)
def iprscan_run(in_fasta_fpath, out_fpath):
    '''It runs iprscan on a fasta file and writes the xml to out_fpath.

    Nothing is returned. The iprscan binary is looked for with the name
    'iprscan'. Raises RuntimeError with the stderr of iprscan if the
    return code is not zero.
    '''
    cmd = ["iprscan", "-cli"]
    cmd += ["-i", in_fasta_fpath, "-o", out_fpath]
    cmd += ["-goterm", "-iprlookup", "-format", "xml"]

    _stdout, stderr, retcode = call(cmd)
    if not retcode:
        return
    raise RuntimeError(stderr)
def realign_bam(bam_fpath, reference_fpath, out_bam_fpath, java_conf=None,
                threads=False, tmp_dir=None):
    '''It realigns a bam around indels with GATK and sorts the result.

    Steps: make sure that the reference faidx index, the picard dict and
    the bam index exist, run RealignerTargetCreator to get the intervals,
    run IndelRealigner on them and sort the realigned bam into
    out_bam_fpath.

    threads is currently ignored because multithreading is switched off
    inside the function.
    '''
    # auxiliary files: reference index, picard dict and bam index
    create_sam_reference_index(reference_fpath)
    create_picard_dict(reference_fpath, java_conf=java_conf)
    create_bam_index(bam_fpath)

    gatk_dir = guess_jar_dir('GenomeAnalysisTK.jar', java_conf)
    gatk_jar = os.path.join(gatk_dir, 'GenomeAnalysisTK.jar')

    # According to GATK multithreading is experimental and the option was
    # removed in version 1.0.4498, so it is disabled here
    parallel = False
    use_threads = parallel and threads and threads > 1

    # step 1: the intervals to realign
    intervals_fhand = tempfile.NamedTemporaryFile(suffix='.intervals')
    targets_cmd = java_cmd(java_conf=java_conf)
    targets_cmd += ['-jar', gatk_jar,
                    '-T', 'RealignerTargetCreator',
                    '-I', bam_fpath,
                    '-R', reference_fpath,
                    '-o', intervals_fhand.name]
    if use_threads:
        targets_cmd += ['-nt', str(get_num_threads(threads))]
    call(targets_cmd, raise_on_error=True, add_ext_dir=False)

    # step 2: the realignment itself
    unsorted_bam = NamedTemporaryFile(suffix='.bam')
    realign_cmd = java_cmd(java_conf=java_conf)
    realign_cmd += ['-Djava.io.tmpdir=%s' % tempfile.gettempdir(),
                    '-jar', gatk_jar,
                    '-I', bam_fpath,
                    '-R', reference_fpath,
                    '-T', 'IndelRealigner',
                    '-targetIntervals', intervals_fhand.name,
                    '-o', unsorted_bam.name]
    if use_threads:
        realign_cmd += ['-nt', str(get_num_threads(threads))]
    call(realign_cmd, raise_on_error=True, add_ext_dir=False)

    # step 3: the realigned bam has to be sorted
    sort_bam_sam(unsorted_bam.name, out_bam_fpath, java_conf=java_conf,
                 tmp_dir=tmp_dir)
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    """It infers the intron locations in the cdna using est2genome.

    sequence           -- the cdna.
    genomic_db         -- database searched with blastn when no
                          similar_sequence is given.
    genomic_seqs_index -- mapping from sequence name to genomic sequence.
    similar_sequence   -- optional hit with the keys "name",
                          "subject_start" and "subject_end".

    It returns the introns found by the est2genome parser, or an empty
    list when the search finds no similar sequence. Raises KeyError if the
    hit is not in genomic_seqs_index and RuntimeError if est2genome fails.
    """
    hit = similar_sequence
    if not hit:
        # looking first for the most similar genomic sequence speeds
        # things up
        hits = look_for_similar_sequences(sequence, database=genomic_db,
                                          blast_program="blastn")
        if not hits:
            return []
        hit = hits[0]

    start = hit["subject_start"]
    end = hit["subject_end"]
    try:
        genomic_seq = genomic_seqs_index[hit["name"]]
    except KeyError:
        raise KeyError("Sequence %s was not found" % hit["name"])

    # est2genome is run with the cdna against the region of the hit
    cdna_file = temp_fasta_file(seqs=[sequence])
    genomic_file = temp_fasta_file(seqs=[genomic_seq])
    cmd = ["est2genome", cdna_file.name, genomic_file.name]
    cmd += ["-sbegin2", str(start), "-send2", str(end)]
    cmd += ["-stdout", "-auto"]

    stdout, stderr, retcode = call(cmd)
    if retcode:
        raise RuntimeError("There was an error running est2genome: " + stderr)

    return est2genome_parser(stdout)["cdna"]["introns"]
def test_call(self):
    'It tests call with a failing and a working command, with and without files'
    # ls on a path that is expected not to exist: the failing command
    cmd = ['ls', '/@#@#@#@']
    ## When it fails
    # without stdout/stderr files and without raising: the second item of
    # the returned tuple is the stderr text
    stderr = call(cmd, add_ext_dir=False)[1]
    assert '/@#@#@#@' in stderr
    # with raise_on_error a RuntimeError that carries the stderr is expected
    try:
        call(cmd, raise_on_error=True, add_ext_dir=False)
        self.fail()
    except RuntimeError as error:
        assert '/@#@#@#@' in str(error)
    # with stdout and stderr files: nothing is returned as text, the error
    # is found in the stderr file
    stdout = tempfile.NamedTemporaryFile()
    stderr = tempfile.NamedTemporaryFile()
    stdout_str, stderr_str = call(cmd, stdout=stdout, stderr=stderr,
                                  add_ext_dir=False)[:2]
    assert not stdout_str
    assert not stderr_str
    assert '/@#@#@#@' in open(stderr.name).read()
    # the same files are reused, now with raise_on_error
    # NOTE(review): stderr.read() reads from the current position of the
    # handle, so this relies on how call() leaves the file - confirm
    try:
        call(cmd, stdout=stdout, stderr=stderr, raise_on_error=True,
             add_ext_dir=False)
        self.fail()
    except RuntimeError as error:
        assert '/@#@#@#@' in stderr.read()
    ## When it works
    cmd = ['ls', '/']
    # without stdout file: the listing is returned as text
    stdout, stderr = call(cmd, add_ext_dir=False)[:2]
    assert 'root'in stdout
    # with stdout file: the returned stdout is None and the listing is in
    # the file
    stdout = tempfile.NamedTemporaryFile()
    stderr = tempfile.NamedTemporaryFile()
    stdout_str, stderr_str = call(cmd, stdout=stdout, stderr=stderr,
                                  add_ext_dir=False)[:2]
    assert stdout_str is None
    assert 'root' in stdout.read()
def map_reads_with_bwa(reference_fpath, reads_fpath, bam_fpath, parameters):
    '''It maps the reads to the reference with bwa and writes a sorted bam.

    parameters is a dict with the required keys 'colorspace',
    'reads_length' ('short' or 'long'), 'threads' and 'java_conf', and the
    optional keys 'tmp_dir' and 'unmapped_fhand'.

    Short reads are mapped with bwa aln + bwa samse, long reads with bwa
    dbwtsw. The bwa index is created if reference_fpath + '.bwt' does not
    exist. Raises ValueError for any other reads_length.
    '''
    colorspace = parameters['colorspace']
    reads_length = parameters['reads_length']
    threads = parameters['threads']
    java_conf = parameters['java_conf']
    tmp_dir = parameters.get('tmp_dir')
    n_threads = get_num_threads(threads)

    # the reference should have an index
    if not os.path.exists(reference_fpath + '.bwt'):
        create_bwa_reference(reference_fpath, color=colorspace)

    ali_suffix = 'output.ali'

    if reads_length == 'short':
        aln_cmd = ['bwa', 'aln', reference_fpath, reads_fpath,
                   '-t', str(n_threads)]
        if colorspace:
            aln_cmd.append('-c')
        sai_fhand = NamedTemporaryFile(dir=tmp_dir, suffix='output.sai',
                                       mode='wb')
        call(aln_cmd, stdout=sai_fhand, raise_on_error=True)

        samse_cmd = ['bwa', 'samse', reference_fpath, sai_fhand.name,
                     reads_fpath]
        ali_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=ali_suffix,
                                       mode='w')
        call(samse_cmd, stdout=ali_fhand, raise_on_error=True)
    elif reads_length == 'long':
        dbwtsw_cmd = ['bwa', 'dbwtsw', reference_fpath, reads_fpath,
                      '-t', str(n_threads)]
        ali_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=ali_suffix)
        call(dbwtsw_cmd, stdout=ali_fhand, raise_on_error=True)
    else:
        raise ValueError('Reads length: short or long')

    unmapped_fhand = parameters.get('unmapped_fhand')
    if unmapped_fhand is not None:
        # from here on the alignment used is the one that get_out_unmapped
        # writes
        filtered_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=ali_suffix)
        get_out_unmapped(ali_fhand, unmapped_fhand, filtered_fhand)
        ali_fhand = filtered_fhand

    # from sam to bam
    unsorted_bam = NamedTemporaryFile(dir=tmp_dir, suffix='bam_file.bam')
    sam2bam(ali_fhand.name, unsorted_bam.name)

    # sort the bam file
    sort_bam_sam(unsorted_bam.name, bam_fpath, sort_method='coordinate',
                 java_conf=java_conf, strict_validation=False,
                 tmp_dir=tmp_dir)
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    '''It infers the intron locations in the cdna using est2genome.

    When no similar_sequence is given the best blastn hit of the cdna in
    genomic_db is used; if there are no hits an empty list is returned.
    The hit is a mapping with the keys 'name', 'subject_start' and
    'subject_end', and its name is looked up in genomic_seqs_index.

    Raises KeyError if the name is not in genomic_seqs_index and
    RuntimeError if est2genome returns an error code.
    '''
    if similar_sequence:
        best_hit = similar_sequence
    else:
        # the most similar sequence of the genomic db is located first,
        # this will speed up things
        found = look_for_similar_sequences(sequence, database=genomic_db,
                                           blast_program='blastn')
        if not found:
            return []
        best_hit = found[0]

    start = best_hit['subject_start']
    end = best_hit['subject_end']
    try:
        genomic_seq = genomic_seqs_index[best_hit['name']]
    except KeyError:
        msg = 'Sequence %s was not found' % best_hit['name']
        raise KeyError(msg)

    # both sequences go to temporary fasta files for est2genome
    cdna_file = temp_fasta_file(seqs=[sequence])
    genomic_file = temp_fasta_file(seqs=[genomic_seq])

    region = ['-sbegin2', str(start), '-send2', str(end)]
    cmd = ['est2genome', cdna_file.name, genomic_file.name]
    cmd.extend(region)
    cmd.extend(['-stdout', '-auto'])
    stdout, stderr, retcode = call(cmd)
    if retcode:
        msg = 'There was an error running est2genome: ' + stderr
        raise RuntimeError(msg)

    # the introns are taken from the parsed est2genome output
    parsed = est2genome_parser(stdout)
    return parsed['cdna']['introns']
def create_gmap_reference_old(reference_fpath):
    '''It builds the gmap database with gmap_setup and the makefile it writes.

    gmap_setup writes a makefile whose coords, gmapdb and install targets
    are then run with make. The makefile and the coords.<name> and
    INSTALL.<name> files found in the working directory are removed at the
    end.

    Raises OSError if running gmap_setup raises OSError.
    '''
    dir_, name = os.path.split(reference_fpath)
    dir_ = dir_ or '.'
    makefile_fpath = os.path.join(dir_, 'Makefile.%s' % name)

    # To avoid thread conflicts the makefile is first written to a file
    # with a random name and then moved to where it belongs
    temp_makefile = NamedTemporaryFile(delete=False)
    setup_cmd = ['gmap_setup', '-d', name, '-D', dir_,
                 '-o', temp_makefile.name, reference_fpath]
    try:
        call(setup_cmd, raise_on_error=True)
    except OSError:
        raise OSError('Gmap mapper is not installed or not in the path')
    shutil.move(temp_makefile.name, makefile_fpath)
    _modify_gmap_makefile(makefile_fpath)

    for target in ('coords', 'gmapdb', 'install'):
        make_cmd = ['make', '-f', makefile_fpath, target]
        call(make_cmd, raise_on_error=True, add_ext_dir=False)

    # we remove the makefile and the extra files left behind
    os.remove(makefile_fpath)
    for leftover in ('coords.%s' % name, 'INSTALL.%s' % name):
        if os.path.exists(leftover):
            os.remove(leftover)
def sort_bam_sam(in_fpath, out_fpath, sort_method='coordinate',
                 java_conf=None, tmp_dir=None, strict_validation=True):
    '''It sorts a bam or sam file using picard SortSam.

    sort_method       -- value of the picard SORT_ORDER argument.
    tmp_dir           -- if given it is passed to picard as TMP_DIR.
    strict_validation -- when false VALIDATION_STRINGENCY=LENIENT is added.

    Raises RuntimeError if picard returns an error code. A specific
    message is used when the output mentions that the device is full.
    '''
    jar_dir = guess_jar_dir('SortSam.jar', java_conf)
    sort_jar = os.path.join(jar_dir, 'SortSam.jar')

    sort_cmd = java_cmd(java_conf)
    sort_cmd += ['-jar', sort_jar,
                 'INPUT=' + in_fpath,
                 'OUTPUT=' + out_fpath,
                 'SORT_ORDER=' + sort_method]
    if not strict_validation:
        sort_cmd.append('VALIDATION_STRINGENCY=LENIENT')
    if tmp_dir:
        sort_cmd.append('TMP_DIR=%s' % tmp_dir)

    stdout, stderr, retcode = call(sort_cmd, raise_on_error=False,
                                   add_ext_dir=False)
    if not retcode:
        return

    # the failure is reported here, so the full disk case gets its own
    # message
    disk_full = 'No space left on device'
    if disk_full in stdout or disk_full in stderr:
        raise RuntimeError('Picard sort consumed all space in device.' +
                           stderr)
    msg = 'Error running picard: %s\n stderr: %s\n stdout: %s' % \
                                    (' '.join(sort_cmd), stderr, stdout)
    raise RuntimeError(msg)
def sam2bam(sam_path, bam_path):
    '''It converts a sam file into a bam file with samtools view -bSh.

    sam_path -- the sam to read.
    bam_path -- the bam to write, given to samtools with -o.
    '''
    output_opts = ['-o', bam_path]
    cmd = ['samtools', 'view', '-bSh'] + output_opts + [sam_path]
    call(cmd, raise_on_error=True)
def makeblastdb_plus(seq_fpath, dbtype, outputdb=None):
    '''It creates a blast database with makeblastdb.

    seq_fpath -- the sequence file, given with -in.
    dbtype    -- given with -dbtype.
    outputdb  -- if it is not None it is given with -out.
    '''
    out_opts = [] if outputdb is None else ['-out', outputdb]
    cmd = ['makeblastdb', '-in', seq_fpath, '-dbtype', dbtype] + out_opts
    call(cmd, raise_on_error=True)