def map_with_bowtie2(index_fpath, bam_fpath, paired_fpaths=None,
                     unpaired_fpaths=None, readgroup=None, threads=None,
                     log_fpath=None, preset='very-sensitive-local',
                     extra_params=None):
    '''It maps with bowtie2 and writes the alignments into a bam file.

    The stdout of bowtie2 is piped into "samtools view", that writes
    bam_fpath.

    index_fpath: value given to the bowtie2 -x option.
    bam_fpath: path of the bam file written by samtools.
    paired_fpaths: a list of pairs of paths. The first item of every pair
        is given to -1 and the second one to -2.
    unpaired_fpaths: a list of paths, given comma-joined to -U.
    readgroup: a dict with the read group tags. It is only used when it has
        an ID key, and then only the ID, LB, SM and PL keys are accepted.
    threads: given to get_num_threads to obtain the -p value.
    log_fpath: file in which the stderr of both processes is written. A
        temporary file is used by default.
    preset: name of the bowtie2 preset, given as --<preset>.
    extra_params: list with extra command line items for bowtie2.

    It raises a RuntimeError if there are no files to map or if the
    readgroup has a not supported tag.
    '''
    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []
    # empty lists are refused too, bowtie2 would be run without any read
    if not paired_fpaths and not unpaired_fpaths:
        raise RuntimeError('At least one file to map is required')

    binary = get_binary_path('bowtie2')
    cmd = [binary, '-x', index_fpath, '--{0}'.format(preset),
           '-p', str(get_num_threads(threads))]
    cmd.extend(extra_params)

    if unpaired_fpaths:
        cmd.extend(['-U', ','.join(unpaired_fpaths)])
    if paired_fpaths:
        plus = [pairs[0] for pairs in paired_fpaths]
        minus = [pairs[1] for pairs in paired_fpaths]
        cmd.extend(['-1', ','.join(plus), '-2', ','.join(minus)])

    if 'ID' in readgroup:
        for key, value in readgroup.items():
            if key not in ('ID', 'LB', 'SM', 'PL'):
                msg = 'The readgroup header tag is not valid: {}'.format(key)
                raise RuntimeError(msg)
            if key == 'ID':
                cmd.extend(['--rg-id', value])
            else:
                cmd.extend(['--rg', '{0}:{1}'.format(key, value)])

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    try:
        bowtie2 = popen(cmd, stderr=stderr, stdout=PIPE)
        cmd = [get_binary_path('samtools'), 'view', '-h', '-b', '-S', '-',
               '-o', bam_fpath]
        samtools = popen(cmd, stdin=bowtie2.stdout, stderr=stderr)
        # Allow bowtie2 to receive a SIGPIPE if samtools exits.
        bowtie2.stdout.close()
        samtools.communicate()
        # the mapper has to be waited for, otherwise it is never reaped
        bowtie2.wait()
    finally:
        # the log handle is closed whatever happened with the processes
        stderr.close()
def map_with_bwasw(index_fpath, bam_fpath, unpaired_fpath=None,
                   paired_fpaths=None, readgroup=None, threads=None,
                   log_fpath=None, extra_params=None):
    '''It maps with the bwa bwasw algorithm and writes a bam file.

    The stdout of bwa is piped into a second process that writes bam_fpath:
    picard AddOrReplaceReadGroups when a readgroup is given, "samtools
    view" otherwise.

    index_fpath: the index given to bwa.
    bam_fpath: path of the resulting bam file.
    unpaired_fpath: path of the file with the unpaired reads.
    paired_fpaths: list with the paths of the paired read files.
    readgroup: a dict with the ID, LB, PL, SM and PU keys.
    threads: given to get_num_threads to obtain the -t value.
    log_fpath: file in which the stderr of both processes is written. A
        temporary file is used by default.
    extra_params: list with extra command line items for bwa.

    It raises a RuntimeError if no read file is given, if paired and
    unpaired reads are given together or if any of the processes fails. In
    the last case the message is the content of the log.
    '''
    if paired_fpaths is None and unpaired_fpath is None:
        raise RuntimeError('At least one file to map is required')
    elif paired_fpaths is not None and unpaired_fpath is not None:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []

    binary = get_binary_path('bwa')
    cmd = [binary, 'bwasw', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)
    if paired_fpaths is not None:
        cmd.extend(paired_fpaths)
    if unpaired_fpath is not None:
        cmd.append(unpaired_fpath)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    try:
        bwa = popen(cmd, stderr=stderr, stdout=PIPE)

        # add readgroup using picard
        picard_tools = get_setting("PICARD_TOOLS_DIR")
        if readgroup:
            cmd = ['java', '-jar',
                   os.path.join(picard_tools, 'AddOrReplaceReadGroups.jar'),
                   'INPUT=/dev/stdin', 'OUTPUT={0}'.format(bam_fpath),
                   'RGID={0}'.format(readgroup['ID']),
                   'RGLB={0}'.format(readgroup['LB']),
                   'RGPL={0}'.format(readgroup['PL']),
                   'RGSM={0}'.format(readgroup['SM']),
                   'RGPU={0}'.format(readgroup['PU']),
                   'VALIDATION_STRINGENCY=LENIENT']
        else:
            cmd = [get_binary_path('samtools'), 'view', '-h', '-b', '-S',
                   '-', '-o', bam_fpath]
        samtools = popen(cmd, stdin=bwa.stdout, stderr=stderr)
        # Allow bwa to receive a SIGPIPE if samtools exits.
        bwa.stdout.close()
        samtools.communicate()
        # returncode is None until the process is waited for, so without
        # this wait a failure of bwa would never be noticed
        bwa.wait()

        if bwa.returncode or samtools.returncode:
            # the log has to be read before closing stderr, the temporary
            # file does not exist afterwards
            with open(stderr.name) as log_fhand:
                log = log_fhand.read()
            raise RuntimeError(log)
    finally:
        stderr.close()
def _makeblastdb_plus(seq_fpath, dbtype, outputdb=None):
    '''It builds a blast database running makeblastdb.

    seq_fpath is given to -in, dbtype to -dbtype and, only when it is not
    None, outputdb to -out.
    '''
    makeblastdb = get_binary_path('makeblastdb')
    out_args = [] if outputdb is None else ['-out', outputdb]
    cmd = [makeblastdb, '-in', seq_fpath, '-dbtype', dbtype] + out_args
    process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(process, binary=makeblastdb)
def test_add_rg_to_bam(self):
    'The readgroup given to map_with_bwamem is found in the bam header'
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    lib_name = 'aa'
    readgroup = {'ID': lib_name, 'PL': 'illumina', 'LB': lib_name,
                 'SM': '{0}_illumina_pe'.format(lib_name), 'PU': '0'}

    directory = TemporaryDir()
    # the temporary directory is closed even when the test fails
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        log_fhand = NamedTemporaryFile()

        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup,
                              log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)

        # -h asks samtools to print the header, the @RG line is there
        cmd = [get_binary_path('samtools'), 'view', '-h', bam_fhand.name]
        out = subprocess.check_output(cmd, stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def map_with_bwamem(index_fpath, unpaired_fpath=None, paired_fpaths=None,
                    threads=None, log_fpath=None, extra_params=None):
    '''It maps with the bwa mem algorithm.

    It returns the running bwa process, its stdout is a pipe.

    index_fpath: the index given to bwa.
    unpaired_fpath: path of the file with the unpaired reads.
    paired_fpaths: list with the paths of the paired read files.
    threads: given to get_num_threads to obtain the -t value.
    log_fpath: file in which the stderr of bwa is written. A temporary
        file is used by default.
    extra_params: list with extra command line items for bwa.

    It raises a RuntimeError if no read file is given or if paired and
    unpaired reads are given together.
    '''
    if paired_fpaths is None and unpaired_fpath is None:
        raise RuntimeError('At least one file to map is required')
    elif paired_fpaths is not None and unpaired_fpath is not None:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    if extra_params is None:
        extra_params = []

    binary = get_binary_path('bwa')
    cmd = [binary, 'mem', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)
    if paired_fpaths is not None:
        cmd.extend(paired_fpaths)
    if unpaired_fpath is not None:
        cmd.append(unpaired_fpath)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    bwa = popen(cmd, stderr=stderr, stdout=PIPE)
    return bwa
def _makeblastdb_plus(seq_fpath, dbtype, outputdb=None):
    """It runs makeblastdb to build a blast database.

    The database is built from seq_fpath with the given dbtype. The -out
    option is only used when outputdb is not None.
    """
    binary = get_binary_path("makeblastdb")
    if outputdb is None:
        cmd = [binary, "-in", seq_fpath, "-dbtype", dbtype]
    else:
        cmd = [binary, "-in", seq_fpath, "-dbtype", dbtype,
               "-out", outputdb]
    makeblastdb = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(makeblastdb, binary=binary)
def _create_bwa_index(index_fpath):
    '''It creates the bwa index of the given fasta file.

    The algorithm used by "bwa index" depends on the number of sequences
    found in the file: bwtsw with more than 10000 sequences, is otherwise.
    '''
    binary = get_binary_path('bwa')
    # how many sequences do we have?
    # They have to be counted: a list compared with an int was always
    # bigger in python 2, so bwtsw was chosen whatever the file had.
    with open(index_fpath) as fhand:
        n_seqs = sum(1 for line in fhand if line.startswith('>'))
    # NOTE(review): the number of sequences is used as a proxy of the size
    # of the reference. Confirm that it is the right criterion for the bwa
    # version in use, the bwa manual describes the limits by size.
    algorithm = 'bwtsw' if n_seqs > 10000 else 'is'
    cmd = [binary, 'index', '-a', algorithm, index_fpath]
    process = popen(cmd, stdout=PIPE, stderr=PIPE)
    check_process_finishes(process, binary=cmd[0])
def get_genome_coverage(bam_fhands):
    '''It returns the histogram of the coverage found in the bam files.

    "bedtools genomecov -ibam" is run for every bam. Only the output lines
    that start with genome are used: the second column is taken as the
    coverage and the third one is added to the count of that coverage.

    bam_fhands: iterable of objects with a name attribute, the path of a
        bam file.
    It returns an IntCounter indexed by coverage.
    '''
    coverage_hist = IntCounter()
    for bam_fhand in bam_fhands:
        bam_fpath = bam_fhand.name
        cmd = [get_binary_path('bedtools'), 'genomecov', '-ibam', bam_fpath]
        cover_process = Popen(cmd, stdout=PIPE)
        try:
            for line in cover_process.stdout:
                if line.startswith('genome'):
                    cov, value = line.split('\t')[1:3]
                    coverage_hist[int(cov)] += int(value)
        finally:
            # the pipe is closed and the child reaped, otherwise every bam
            # would leave an open descriptor and a zombie process
            cover_process.stdout.close()
            cover_process.wait()
    return coverage_hist
def get_genome_coverage(bam_fhands):
    '''It calculates the coverage histogram of the given bam files.

    For every bam "bedtools genomecov -ibam" is run and the lines of its
    output that start with genome are parsed: the second column is used as
    the coverage and the third one is added to the count of that coverage.

    bam_fhands: iterable of objects with a name attribute, the path of a
        bam file.
    It returns an IntCounter indexed by coverage.
    '''
    coverage_hist = IntCounter()
    for bam_fhand in bam_fhands:
        bam_fpath = bam_fhand.name
        cmd = [get_binary_path('bedtools'), 'genomecov', '-ibam', bam_fpath]
        cover_process = Popen(cmd, stdout=PIPE)
        try:
            for line in cover_process.stdout:
                if not line.startswith('genome'):
                    continue
                cov, value = line.split('\t')[1:3]
                coverage_hist[int(cov)] += int(value)
        finally:
            # close the pipe and reap the child, they were left open and
            # as a zombie respectively
            cover_process.stdout.close()
            cover_process.wait()
    return coverage_hist
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqs are written with write_seqs in fasta format and the resulting
    file is given to estscan.

    pep_out_fpath: path given to the estscan -t option.
    dna_out_fpath: path given to the estscan -o option.
    matrix_fpath: path given to the estscan -M option.
    '''
    seq_fhand = write_seqs(seqs, file_format='fasta')
    # the sequence file is closed even if estscan fails
    try:
        # estscan reads the file by name, everything should be on disk
        seq_fhand.flush()
        binary = get_binary_path('estscan')
        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath,
               '-M', matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        seq_fhand.close()
def get_or_create_bowtie2_index(fpath, directory=None):
    '''It returns the path of the bowtie2 index, creating it if required.

    The index is located in the given directory, using the base name of
    fpath, or next to fpath when there is no directory. bowtie2-build is
    only run when _bowtie2_index_exists does not find the index.
    '''
    binary = get_binary_path('bowtie2-build')

    index_fpath = fpath
    if directory is not None:
        index_fpath = os.path.join(directory, os.path.basename(fpath))

    if _bowtie2_index_exists(index_fpath):
        return index_fpath

    build_cmd = [binary, '-f', fpath, index_fpath]
    build_process = popen(build_cmd, stdout=PIPE, stderr=PIPE)
    check_process_finishes(build_process, binary=binary)
    return index_fpath
def map_process_to_bam(map_process, bam_fpath, log_fpath=None):
    '''It writes into a bam file the stdout of a mapping process.

    The stdout of map_process is used as the stdin of "samtools view", that
    writes bam_fpath. The stderr of samtools goes to log_fpath or, by
    default, to a temporary file.
    '''
    stderr = (NamedTemporaryFile(suffix='.stderr') if log_fpath is None
              else open(log_fpath, 'w'))

    samtools_binary = get_binary_path('samtools')
    view_cmd = [samtools_binary, 'view', '-h', '-b', '-S', '-']
    view_cmd += ['-o', bam_fpath]
    samtools = popen(view_cmd, stdin=map_process.stdout, stderr=stderr)
    # Once our copy of the pipe is closed the mapper can receive a SIGPIPE
    # if samtools exits.
    map_process.stdout.close()
    samtools.communicate()
def test_map_with_bwa(self):
    'A known read is found in the bam created by map_with_bwasw'
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')

    directory = TemporaryDir()
    # the temporary directory is closed even when the test fails
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_with_bwasw(index_fpath, bam_fhand.name,
                       unpaired_fpath=reads_fpath)
        out = subprocess.check_output([get_binary_path('samtools'), 'view',
                                       bam_fhand.name])
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def test_map_with_bwa(self):
    'The bam written by map_with_bwasw has one of the mapped reads'
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')

    directory = TemporaryDir()
    # close the temporary directory whatever the result of the test is
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_with_bwasw(index_fpath, bam_fhand.name,
                       unpaired_fpath=reads_fpath)
        cmd = [get_binary_path('samtools'), 'view', bam_fhand.name]
        out = subprocess.check_output(cmd)
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def map_with_bwamem(index_fpath, unpaired_fpath=None, paired_fpaths=None,
                    interleave_fpath=None, threads=None, log_fpath=None,
                    extra_params=None, readgroup=None):
    '''It maps with the bwa mem algorithm.

    It returns the running bwa process, its stdout is a pipe.

    index_fpath: the index given to bwa.
    unpaired_fpath: path of the file with the unpaired reads.
    paired_fpaths: list with the paths of the paired read files.
    interleave_fpath: path of a file with interleaved pairs, bwa is run
        with -p for it.
    threads: given to get_num_threads to obtain the -t value.
    log_fpath: file in which the stderr of bwa is written. A temporary
        file is used by default.
    extra_params: list with extra command line items for bwa. It is not
        modified. Any -p found in it is ignored, the option is only set
        for the interleaved files.
    readgroup: a dict with the ID, SM, PL and LB keys, used to build the
        @RG line given to the -R option.

    Only one of unpaired_fpath, paired_fpaths and interleave_fpath can be
    given, a RuntimeError is raised with none or with more than one.
    '''
    interleave = False
    num_called_fpaths = 0
    in_fpaths = []
    if unpaired_fpath is not None:
        num_called_fpaths += 1
        in_fpaths.append(unpaired_fpath)
    if paired_fpaths is not None:
        num_called_fpaths += 1
        in_fpaths.extend(paired_fpaths)
    if interleave_fpath is not None:
        num_called_fpaths += 1
        in_fpaths.append(interleave_fpath)
        interleave = True

    if num_called_fpaths == 0:
        raise RuntimeError('At least one file to map is required')
    if num_called_fpaths > 1:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    # A new list is built, the list of the caller is left untouched.
    if extra_params is None:
        extra_params = []
    extra_params = [param for param in extra_params if param != '-p']
    if interleave:
        extra_params.append('-p')

    if readgroup is not None:
        rg_str = '@RG\tID:{ID}\tSM:{SM}\tPL:{PL}\tLB:{LB}'.format(**readgroup)
        extra_params.extend(['-R', rg_str])

    binary = get_binary_path('bwa')
    cmd = [binary, 'mem', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)
    cmd.extend(in_fpaths)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    bwa = popen(cmd, stderr=stderr, stdout=PIPE)
    return bwa
def map_process_to_bam(map_process, bam_fpath, log_fpath=None, tempdir=None):
    '''It converts into a bam file the sam that a mapping process writes.

    map_process is a process with a sam file in its stdout. That stdout is
    used as the stdin of "samtools view", that writes bam_fpath. The
    stderr of samtools goes to log_fpath or, by default, to a temporary
    file. tempdir is accepted but it is not used here.
    '''
    if log_fpath is not None:
        stderr = open(log_fpath, 'w')
    else:
        stderr = NamedTemporaryFile(suffix='.stderr')

    to_bam_cmd = [get_binary_path('samtools'), 'view']
    to_bam_cmd.extend(['-h', '-b', '-S', '-', '-o', bam_fpath])
    samtools = popen(to_bam_cmd, stdin=map_process.stdout, stderr=stderr)
    # Our copy of the pipe is closed, so the mapper can receive a SIGPIPE
    # if samtools exits.
    map_process.stdout.close()
    samtools.communicate()
def get_reference_counts(bam_fpath):
    '''Using samtools idxstats it generates dictionaries with read counts.

    It yields one dict per output line with the reference, length,
    mapped_reads and unmapped_reads keys. For the line with the * reference
    name both the reference and the length are None.
    '''
    cmd = [get_binary_path('samtools'), 'idxstats', bam_fpath]
    idx_process = Popen(cmd, stdout=PIPE)
    # we're not using pysam.idxstats here because the stdout differed
    # depending on how the tests were run
    try:
        for line in idx_process.stdout:
            ref_name, ref_length, mapped_reads, unmapped_reads = line.split()
            if ref_name == '*':
                ref_name = None
                ref_length = None
            else:
                ref_length = int(ref_length)
            yield {'reference': ref_name, 'length': ref_length,
                   'mapped_reads': int(mapped_reads),
                   'unmapped_reads': int(unmapped_reads)}
    finally:
        # It also runs when the generator is closed before being exhausted.
        # The pipe is closed and the child reaped, they were left open and
        # as a zombie respectively.
        idx_process.stdout.close()
        idx_process.wait()
def map_with_bowtie2(index_fpath, paired_fpaths=None, unpaired_fpath=None,
                     readgroup=None, threads=None, log_fpath=None,
                     preset='very-sensitive-local', extra_params=None):
    '''It maps with bowtie2.

    It returns the running bowtie2 process, its stdout is a pipe.

    index_fpath: value given to the bowtie2 -x option.
    paired_fpaths: a pair of paths, the first one is given to -1 and the
        second one to -2.
    unpaired_fpath: path given to -U.
    readgroup: a dict with the read group tags. It is only used when it has
        an ID key, and then only the ID, LB, SM and PL keys are accepted.
    threads: given to get_num_threads to obtain the -p value.
    log_fpath: file in which the stderr of bowtie2 is written. A temporary
        file is used by default.
    preset: name of the bowtie2 preset, given as --<preset>.
    extra_params: list with extra command line items for bowtie2.

    It raises a RuntimeError if there are no files to map or if the
    readgroup has a not supported tag.
    '''
    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []
    # empty values are refused too, bowtie2 would be run without any read
    if not paired_fpaths and not unpaired_fpath:
        raise RuntimeError('At least one file to map is required')

    binary = get_binary_path('bowtie2')
    cmd = [binary, '-x', index_fpath, '--{0}'.format(preset),
           '-p', str(get_num_threads(threads))]
    cmd.extend(extra_params)

    if unpaired_fpath:
        cmd.extend(['-U', unpaired_fpath])
    if paired_fpaths:
        cmd.extend(['-1', paired_fpaths[0], '-2', paired_fpaths[1]])

    if 'ID' in readgroup:
        for key, value in readgroup.items():
            if key not in ('ID', 'LB', 'SM', 'PL'):
                msg = 'The readgroup header tag is not valid: {}'.format(key)
                raise RuntimeError(msg)
            if key == 'ID':
                cmd.extend(['--rg-id', value])
            else:
                cmd.extend(['--rg', '{0}:{1}'.format(key, value)])

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    bowtie2 = popen(cmd, stderr=stderr, stdout=PIPE)
    return bowtie2
def do_blast(query_fpath, db_fpath, program, out_fpath, params=None):
    """It runs a blast of the query against the database.

    The result is written by blast in out_fpath. params is a dict that can
    have the evalue (0.001 by default), task (megablast by default) and
    outfmt (5 by default) keys.

    A ValueError is raised when the program is not one of the blast
    programs.
    """
    options = params if params else {}
    evalue = options.get("evalue", 0.001)
    task = options.get("task", "megablast")
    outfmt = str(options.get("outfmt", 5))

    valid_tasks = ("blastn", "blastn-short", "dc-megablast", "megablast",
                   "rmblastn")
    assert task in valid_tasks

    valid_programs = ("blastn", "blastp", "blastx", "tblastx", "tblastn")
    if program not in valid_programs:
        raise ValueError("The given program is invalid: " + str(program))

    binary = get_binary_path(program)
    cmd = [binary,
           "-query", query_fpath,
           "-db", db_fpath,
           "-out", out_fpath,
           "-evalue", str(evalue),
           "-task", task,
           "-outfmt", outfmt]
    blast = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(blast, binary=binary)
def test_add_rg_to_bam(self):
    'The bam header has the readgroup given to map_with_bwamem'
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    lib_name = 'aa'
    readgroup = {'ID': lib_name, 'PL': 'illumina', 'LB': lib_name,
                 'SM': '{0}_illumina_pe'.format(lib_name), 'PU': '0'}

    directory = TemporaryDir()
    # close the temporary directory whatever the result of the test is
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath,
                                              directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        log_fhand = NamedTemporaryFile()

        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup,
                              log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)

        # the header is requested with -h to look for the @RG line
        out = subprocess.check_output([get_binary_path('samtools'), 'view',
                                       '-h', bam_fhand.name],
                                      stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def map_with_hisat2(index_fpath, paired_fpaths=None, unpaired_fpath=None,
                    readgroup=None, threads=None, log_fhand=None,
                    extra_params=None):
    '''It maps with hisat2.

    It returns the running hisat2 process, its stdout is a pipe.

    index_fpath: value given to the hisat2 -x option.
    paired_fpaths: a pair of paths, the first one is given to -1 and the
        second one to -2.
    unpaired_fpath: path given to -U.
    readgroup: a dict with the read group tags. It is only used when it has
        an ID key, and then only the ID, LB, SM and PL keys are accepted.
    threads: given to get_num_threads to obtain the -p value.
    log_fhand: given to popen as the stderr of hisat2.
    extra_params: list with extra command line items for hisat2.

    It raises a RuntimeError if there are no files to map or if the
    readgroup has a not supported tag.
    '''
    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []
    # empty values are refused too, hisat2 would be run without any read
    if not paired_fpaths and not unpaired_fpath:
        raise RuntimeError('At least one file to map is required')

    binary = get_binary_path('hisat2')
    cmd = [binary, '-x', index_fpath, '-p', str(get_num_threads(threads))]
    cmd.extend(extra_params)

    if unpaired_fpath:
        cmd.extend(['-U', unpaired_fpath])
    if paired_fpaths:
        cmd.extend(['-1', paired_fpaths[0], '-2', paired_fpaths[1]])

    if 'ID' in readgroup:
        for key, value in readgroup.items():
            if key not in ('ID', 'LB', 'SM', 'PL'):
                msg = 'The readgroup header tag is not valid: {}'.format(key)
                raise RuntimeError(msg)
            if key == 'ID':
                cmd.extend(['--rg-id', value])
            else:
                cmd.extend(['--rg', '{0}:{1}'.format(key, value)])

    hisat2 = popen(cmd, stderr=log_fhand, stdout=PIPE)
    return hisat2
def _do_blast_local(query_fpath, db_fpath, program, out_fpath, params=None):
    """It does a blast with the locally installed blast programs.

    The result is written by blast in out_fpath. The evalue and the task
    are obtained from params by _parse_blast_params, outfmt is 5 unless
    params has an outfmt key, and every other item left in params is given
    to blast as a -key value option. The params dict of the caller is not
    modified.

    A ValueError is raised when the program is not one of the blast
    programs.
    """
    # A copy is used: the outfmt key is removed below and the dict of the
    # caller should not lose it.
    # NOTE(review): presumably _parse_blast_params also removes from params
    # the keys that it parses, confirm it.
    params = dict(params) if params else {}
    evalue, task = _parse_blast_params(params, program)
    outfmt = params.pop("outfmt", 5)

    if program not in ("blastn", "blastp", "blastx", "tblastx", "tblastn"):
        raise ValueError("The given program is invalid: " + str(program))

    binary = get_binary_path(program)
    cmd = [binary, "-query", query_fpath, "-db", db_fpath, "-out", out_fpath]
    cmd.extend(["-evalue", str(evalue), "-outfmt", str(outfmt)])
    if task:
        cmd.extend(["-task", task])
    # items instead of viewitems, it works in python 2 and 3
    for key, value in params.items():
        cmd.extend(("-" + key, str(value)))

    process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(process, binary=cmd[0])
def _do_blast_local(query_fpath, db_fpath, program, out_fpath, params=None):
    '''It does a blast with the locally installed blast programs.

    The result is written by blast in out_fpath. The evalue and the task
    are obtained from params by _parse_blast_params, outfmt is 5 unless
    params has an outfmt key, and every other item left in params is given
    to blast as a -key value option. The params dict of the caller is not
    modified.

    A ValueError is raised when the program is not one of the blast
    programs.
    '''
    # A copy is used: the outfmt key is removed below and the dict of the
    # caller should not lose it.
    # NOTE(review): presumably _parse_blast_params also removes from params
    # the keys that it parses, confirm it.
    params = dict(params) if params else {}
    evalue, task = _parse_blast_params(params, program)
    outfmt = params.pop('outfmt', 5)

    if program not in ('blastn', 'blastp', 'blastx', 'tblastx', 'tblastn'):
        raise ValueError('The given program is invalid: ' + str(program))

    binary = get_binary_path(program)
    cmd = [binary, '-query', query_fpath, '-db', db_fpath, '-out', out_fpath]
    cmd.extend(['-evalue', str(evalue), '-outfmt', str(outfmt)])
    if task:
        cmd.extend(['-task', task])
    # items instead of viewitems, it works in python 2 and 3
    for key, value in params.items():
        cmd.extend(('-' + key, str(value)))

    process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(process, binary=cmd[0])
def map_with_bowtie2(index_fpath, bam_fpath, paired_fpaths=None,
                     unpaired_fpaths=None, readgroup=None, threads=None,
                     log_fpath=None, preset='very-sensitive-local',
                     extra_params=None):
    '''It maps with bowtie2 and creates a bam file with the result.

    bowtie2 writes to a pipe that is read by "samtools view", the process
    that writes bam_fpath.

    index_fpath: value given to the bowtie2 -x option.
    bam_fpath: path of the bam file written by samtools.
    paired_fpaths: a list of pairs of paths. The first item of every pair
        is given to -1 and the second one to -2.
    unpaired_fpaths: a list of paths, given comma-joined to -U.
    readgroup: a dict with the read group tags. It is only used when it has
        an ID key, and then only the ID, LB, SM and PL keys are accepted.
    threads: given to get_num_threads to obtain the -p value.
    log_fpath: file in which the stderr of both processes is written. A
        temporary file is used by default.
    preset: name of the bowtie2 preset, given as --<preset>.
    extra_params: list with extra command line items for bowtie2.

    It raises a RuntimeError if there are no files to map or if the
    readgroup has a not supported tag.
    '''
    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []
    # an empty list is not a file to map, bowtie2 would have no reads
    if not paired_fpaths and not unpaired_fpaths:
        raise RuntimeError('At least one file to map is required')

    binary = get_binary_path('bowtie2')
    cmd = [
        binary, '-x', index_fpath, '--{0}'.format(preset), '-p',
        str(get_num_threads(threads))
    ]
    cmd.extend(extra_params)

    if unpaired_fpaths:
        cmd.extend(['-U', ','.join(unpaired_fpaths)])
    if paired_fpaths:
        plus = [pairs[0] for pairs in paired_fpaths]
        minus = [pairs[1] for pairs in paired_fpaths]
        cmd.extend(['-1', ','.join(plus), '-2', ','.join(minus)])

    if 'ID' in readgroup:
        for key, value in readgroup.items():
            if key not in ('ID', 'LB', 'SM', 'PL'):
                msg = 'The readgroup header tag is not valid: {}'.format(key)
                raise RuntimeError(msg)
            if key == 'ID':
                cmd.extend(['--rg-id', value])
            else:
                cmd.extend(['--rg', '{0}:{1}'.format(key, value)])

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    try:
        bowtie2 = popen(cmd, stderr=stderr, stdout=PIPE)
        cmd = [
            get_binary_path('samtools'), 'view', '-h', '-b', '-S', '-', '-o',
            bam_fpath
        ]
        samtools = popen(cmd, stdin=bowtie2.stdout, stderr=stderr)
        # Allow bowtie2 to receive a SIGPIPE if samtools exits.
        bowtie2.stdout.close()
        samtools.communicate()
        # without waiting for it the mapper would never be reaped
        bowtie2.wait()
    finally:
        # the log handle is always closed
        stderr.close()
def map_with_bwasw(index_fpath, bam_fpath, unpaired_fpath=None,
                   paired_fpaths=None, readgroup=None, threads=None,
                   log_fpath=None, extra_params=None):
    '''It maps with the bwa bwasw algorithm and creates a bam file.

    bwa writes to a pipe that is read by the process that writes
    bam_fpath: picard AddOrReplaceReadGroups when a readgroup is given,
    "samtools view" otherwise.

    index_fpath: the index given to bwa.
    bam_fpath: path of the resulting bam file.
    unpaired_fpath: path of the file with the unpaired reads.
    paired_fpaths: list with the paths of the paired read files.
    readgroup: a dict with the ID, LB, PL, SM and PU keys.
    threads: given to get_num_threads to obtain the -t value.
    log_fpath: file in which the stderr of both processes is written. A
        temporary file is used by default.
    extra_params: list with extra command line items for bwa.

    It raises a RuntimeError if no read file is given, if paired and
    unpaired reads are given together or if any of the processes fails. In
    the last case the message is the content of the log.
    '''
    if paired_fpaths is None and unpaired_fpath is None:
        raise RuntimeError('At least one file to map is required')
    elif paired_fpaths is not None and unpaired_fpath is not None:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []

    binary = get_binary_path('bwa')
    cmd = [binary, 'bwasw', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)
    if paired_fpaths is not None:
        cmd.extend(paired_fpaths)
    if unpaired_fpath is not None:
        cmd.append(unpaired_fpath)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')

    try:
        bwa = popen(cmd, stderr=stderr, stdout=PIPE)

        # add readgroup using picard
        picard_tools = get_setting("PICARD_TOOLS_DIR")
        if readgroup:
            cmd = [
                'java', '-jar',
                os.path.join(picard_tools, 'AddOrReplaceReadGroups.jar'),
                'INPUT=/dev/stdin', 'OUTPUT={0}'.format(bam_fpath),
                'RGID={0}'.format(readgroup['ID']),
                'RGLB={0}'.format(readgroup['LB']),
                'RGPL={0}'.format(readgroup['PL']),
                'RGSM={0}'.format(readgroup['SM']),
                'RGPU={0}'.format(readgroup['PU']),
                'VALIDATION_STRINGENCY=LENIENT'
            ]
        else:
            cmd = [
                get_binary_path('samtools'), 'view', '-h', '-b', '-S', '-',
                '-o', bam_fpath
            ]
        samtools = popen(cmd, stdin=bwa.stdout, stderr=stderr)
        # Allow bwa to receive a SIGPIPE if samtools exits.
        bwa.stdout.close()
        samtools.communicate()
        # The returncode of a process is None until it is waited for, the
        # check below could not see a failed bwa without this wait.
        bwa.wait()

        if bwa.returncode or samtools.returncode:
            # read the log before stderr is closed, if it is the temporary
            # file it will not exist afterwards
            with open(stderr.name) as log_fhand:
                log = log_fhand.read()
            raise RuntimeError(log)
    finally:
        stderr.close()
def index_bam(bam_fpath):
    '''It creates the index of a bam file running "samtools index".

    subprocess.check_call raises a CalledProcessError if samtools fails.
    '''
    index_cmd = [get_binary_path('samtools'), 'index', bam_fpath]
    subprocess.check_call(index_cmd)