def _create_tabix(fname, ftype):
    logger = logging.getLogger("pita")
    tabix_file = ""
    logger.info("Creating tabix index for %s", os.path.basename(fname))
    logger.debug("Preparing %s for tabix", fname)
    tmp = NamedTemporaryFile(prefix="pita", delete=False)
    preset = "gff"
    if ftype == "bed":
        cmd = "sort -k1,1 -k2g,2 {0} | grep -v track | grep -v \"^#\" > {1}"
        preset = "bed"
    elif ftype in ["gff", "gff3", "gtf"]:
        cmd = "sort -k1,1 -k4g,4 {0} | grep -v \"^#\" > {1}"

    # Sort the input file
    logger.debug(cmd.format(fname, tmp.name))
    sp.call(cmd.format(fname, tmp.name), shell=True)
    # Compress using bgzip
    logger.debug("compressing %s", tmp.name)
    tabix_file = tmp.name + ".gz"
    pysam.tabix_compress(tmp.name, tabix_file)
    tmp.close()
    # Index (using the tabix command line, as pysam.index results in a segmentation fault)
    logger.debug("indexing %s", tabix_file)
    sp.call("tabix {0} -p {1}".format(tabix_file, preset), shell=True)
    return tabix_file

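# The example above shells out to the `tabix` binary for the indexing step because of an
# old pysam issue. With current pysam releases the whole round trip can usually be done
# in-process, as most of the later examples do. A minimal sketch, assuming a sorted,
# plain-text BED file named "regions.sorted.bed" (the file name is hypothetical):
import pysam

def compress_and_index(path, preset="bed"):
    """bgzip-compress a sorted interval file and build a .tbi index next to it."""
    gz = path + ".gz"
    pysam.tabix_compress(path, gz, force=True)        # writes path.gz
    pysam.tabix_index(gz, preset=preset, force=True)  # writes path.gz.tbi
    return gz

# compress_and_index("regions.sorted.bed")
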
def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
    """ Annotate variants in VCF file with translation consequences using snpEff.
    """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genome,
        inVcf
    ]

    with open(tmpVcf, 'wt') as outf:
        self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)

def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    tmpfile = tempfile.NamedTemporaryFile()
    sort_params = None

    if options.chrom_col and options.start_col and options.end_col:
        sort_params = [
            "sort",
            "-k%(i)s,%(i)s" % {'i': options.chrom_col},
            "-k%(i)i,%(i)in" % {'i': options.start_col},
            "-k%(i)i,%(i)in" % {'i': options.end_col}
        ]
    elif options.preset == "bed":
        sort_params = ["sort", "-k1,1", "-k2,2n", "-k3,3n"]
    elif options.preset == "vcf":
        sort_params = ["sort", "-k1,1", "-k2,2n"]
    elif options.preset == "gff":
        sort_params = ["sort", "-s", "-k1,1", "-k4,4n"]  # stable sort on start column
    # Skip any lines starting with "#" and "track"
    grepped = subprocess.Popen(["grep", "-e", "^\"#\"", "-e", "^track", "-v", input_fname],
                               stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    after_sort = subprocess.Popen(sort_params, stdin=grepped.stdout,
                                  stderr=subprocess.PIPE, stdout=tmpfile)
    grepped.stdout.close()
    output, err = after_sort.communicate()
    pysam.tabix_compress(tmpfile.name, output_fname, force=True)

def main():
    # Read options, args.
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    pysam.tabix_compress(input_fname, output_fname, force=True)

def testIndexPresetCompressed(self):
    '''test indexing via preset.'''
    pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
    pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
    checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
    checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)

def make_bias_track(args, bases=500000, splitsize=1000):
    """function to compute bias track
    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j, itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset="bed", force=True)

def ensureIndexed(bedPath, preset="bed", trySorting=True): if not bedPath.endswith(".gz"): if not os.path.exists(bedPath + ".gz"): logging.info("bgzf compressing {}".format(bedPath)) pysam.tabix_compress(bedPath, bedPath + ".gz") if not os.path.exists(bedPath + ".gz"): raise Exception( "Failed to create compress {preset} file for {file}; make sure the {preset} file is " "sorted and the directory is writeable".format(preset=preset, file=bedPath) ) bedPath += ".gz" if not os.path.exists(bedPath + ".tbi"): logging.info("creating tabix index for {}".format(bedPath)) pysam.tabix_index(bedPath, preset=preset) if not os.path.exists(bedPath + ".tbi"): raise Exception( "Failed to create tabix index file for {file}; make sure the {preset} file is " "sorted and the directory is writeable".format(preset=preset, file=bedPath) ) line = pysam.Tabixfile(bedPath).fetch().next() if len(line.strip().split("\t")) < 6 and preset == "bed": raise AnnotationError( "BED files need to have at least 6 (tab-delimited) fields (including " "chrom, start, end, name, score, strand; score is unused)" ) if len(line.strip().split("\t")) < 9 and preset == "gff": raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields") return bedPath
def get_cov(args, bases=50000, splitsize=1000):
    """function to get coverages
    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        sets = chunks.split(items=bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.cov.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset="bed", force=True)

def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
    """ TODO: docstring here """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    else:
        tmpVcf = outVcf

    args = ' '.join([
        'eff',
        '-c', '{}/snpEff.config'.format(self.executable_path()),
        '-i', in_format,
        '-o', out_format,
        genome,
        '-treatAllAsProteinCoding false',
        '-noLog',
        '-ud 0',
        '-noStats',
        eff_options
    ])

    if inVcf.endswith('.gz'):
        pre_pipe = "zcat {} | ".format(inVcf)
    else:
        pre_pipe = "cat {} | ".format(inVcf)
    post_pipe = " > {}".format(tmpVcf)
    self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe, post_pipe=post_pipe)

    if outVcf.endswith('.vcf.gz'):
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)

def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """ Annotate variants in VCF file with translation consequences using snpEff.
    """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join(sorted(genomes))
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)

        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)

def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
    """ Annotate variants in VCF file with translation consequences using snpEff.
    """
    if outVcf.endswith('.vcf.gz'):
        tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
    elif outVcf.endswith('.vcf'):
        tmpVcf = outVcf
    else:
        raise Exception("invalid input")

    sortedAccessionString = ", ".join([util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
    databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

    genomeToUse = ""

    # if we don't have the genome, by name (snpEff official) or by hash (custom)
    if not self.has_genome(databaseId):
        if not self.has_genome(genomes[0]):
            _log.info("Checking for snpEff database online...")
            # check to see if it is available for download, and if so install it
            for row in self.available_databases():
                if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()) or (
                        genomes[0].lower() in row['Organism'].lower()):
                    self.download_db(row['Genome'])

    # backward compatibility for where a single genome name is provided
    if self.has_genome(genomes[0]):
        genomeToUse = genomes[0]
    else:
        # if the hash of the accessions passed in is not present in the genomes db
        if not self.has_genome(databaseId):
            self.create_db(genomes, emailAddress, JVMmemory)

        if self.has_genome(databaseId):
            genomeToUse = databaseId

    if not genomeToUse:
        raise Exception()

    args = [
        '-treatAllAsProteinCoding', 'false',
        '-t',
        '-noLog',
        '-ud', '0',
        '-noStats',
        '-noShiftHgvs',
        genomeToUse,
        os.path.realpath(inVcf)
    ]

    command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
    if command_ps.returncode == 0:
        with open(tmpVcf, 'wt') as outf:
            outf.write(command_ps.stdout.decode("utf-8"))

        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
    else:
        raise subprocess.CalledProcessError(cmd=command_ps.args,
                                            returncode=command_ps.returncode,
                                            output=command_ps.stdout)

def testEmptyFileVCFGZWithoutIndex(self):
    with get_temp_context("tmp_testEmptyFileWithoutIndex.vcf") as fn:
        with open(fn, "w"):
            pass

        pysam.tabix_compress(fn, fn + ".gz", force=True)

        self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")

def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')

def run_nfr(args): """run nfr calling """ if args.bam is None and args.ins_track is None: raise Exception("Must supply either bam file or insertion track") if not args.out: args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3]) if args.fasta is not None: chrs_fasta = read_chrom_sizes_from_fasta(args.fasta) pwm = PWM.open(args.pwm) chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down)) else: chunks = ChunkList.read(args.bed) if args.bam is not None: chrs_bam = read_chrom_sizes_from_bam(args.bam) chunks.checkChroms(chrs_bam, chrom_source = "BAM file") chunks.merge() maxQueueSize = args.cores * 10 params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper, fasta = args.fasta, pwm = args.pwm) sets = chunks.split(items = args.cores * 5) pool1 = mp.Pool(processes = max(1,args.cores-1)) nfr_handle = open(args.out + '.nfrpos.bed','w') nfr_handle.close() nfr_queue = mp.JoinableQueue() nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out)) nfr_process.start() if params.ins_track is None: ins_handle = open(args.out + '.ins.bedgraph','w') ins_handle.close() ins_queue = mp.JoinableQueue() ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out)) ins_process.start() for j in sets: tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params))) for result in tmp: if params.ins_track is None: nfr_queue.put(result[0]) ins_queue.put(result[1]) else: nfr_queue.put(result) pool1.close() pool1.join() nfr_queue.put('STOP') nfr_process.join() if params.ins_track is None: ins_queue.put('STOP') ins_process.join() pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True) shell_command('rm ' + args.out + '.nfrpos.bed') pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True) if params.ins_track is None: pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.ins.bedgraph') pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f),
                         os.path.join(options.output_dir, f + '.gz'), force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'),
                      seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')

def _index_with_tabix(self):
    """Compress and index output file by Tabix"""
    pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
    pysam.tabix_index(self._fn + '.gz', seq_col=self.idx_chrom, start_col=self.idx_start,
                      end_col=self.idx_end, meta_char='#', force=True)

def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )

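# As the inline note above says, `pysam.tabix_index` takes 0-based column numbers while the
# `tabix` command line takes 1-based ones (`tabix -s 1 -b 2 -e 2` would match seq_col=0,
# start_col=1, end_col=1). A minimal sketch of indexing a headered, tab-separated table this
# way and querying it back; the file name and region are hypothetical:
import pysam

pysam.tabix_compress("summary.tsv", "summary.tsv.gz", force=True)
pysam.tabix_index("summary.tsv.gz", seq_col=0, start_col=1, end_col=1,
                  line_skip=1, force=True)  # skip the single header line

with pysam.TabixFile("summary.tsv.gz") as tbx:
    for row in tbx.fetch("chr1", 1000000, 1001000):
        print(row)
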
def testEmptyFileVCFGZ(self):
    with open("tmp_testEmptyFile.vcf", "w"):
        pass

    pysam.tabix_compress("tmp_testEmptyFile.vcf",
                         "tmp_testEmptyFile.vcf.gz")

    self.assertRaises(ValueError, pysam.VariantFile, "tmp_testEmptyFile.vcf.gz")

    os.unlink("tmp_testEmptyFile.vcf")
    os.unlink("tmp_testEmptyFile.vcf.gz")

def bamTobed(bamInput=None, bedOutput=None, compress=True):
    # generate temp file for sorting and indexing
    bedOutput_path = os.path.realpath(bedOutput)
    this_pid = os.getpid()
    tmp_split = os.path.splitext(bedOutput_path)
    tmp_bedOutput = tmp_split[0] + "-temp-" + str(this_pid) + tmp_split[1]

    bai = bamInput + ".bai"
    if not os.path.exists(bai):
        message = "Index file " + bai + " does not exist!"
        raise commonError(message)

    bedWrite = open(tmp_bedOutput, "w")

    input_file = pysam.Samfile(bamInput, "rb")
    chr_reference = input_file.references
    for read1, read2 in read_pair_generator(input_file):
        read1Start = read1.reference_start
        read1End = read1.reference_end
        read2Start = read2.reference_start
        read2End = read2.reference_end

        if not read1.is_reverse:  # read1 is forward strand, read2 is reverse strand
            rstart = read1Start  # 0-based left-most site
            rend = read2End
        else:  # read1 is reverse strand, read2 is forward strand
            rstart = read2Start  # 0-based left-most site
            rend = read1End

        if (rstart < 0) or (rend < 0) or (rstart >= rend):
            continue

        tmp_str = chr_reference[read1.tid] + "\t" + str(rstart) + "\t" + str(rend) + "\n"
        bedWrite.write(tmp_str)

    bedWrite.close()
    print("Fragments generated, waiting for sorting......")

    bedData = pybedtools.BedTool(tmp_bedOutput)
    bedData.sort(output=bedOutput)

    os.remove(tmp_bedOutput)

    print("Fragments sorted.")

    if compress:
        print("Waiting for compressing and indexing......")
        bedgzfile = bedOutput + ".gz"
        pysam.tabix_compress(bedOutput, bedgzfile, force=False)
        pysam.tabix_index(bedgzfile, preset="bed", zerobased=True)
        print("Indexing bedgz file finished!")

    return True

def run_merge(args): if not args.out: args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3]) occ = NucList.read(args.occpeaks, "occ", args.min_occ) nuc = NucList.read(args.nucpos, "nuc", args.min_occ) new = merge(occ, nuc, args.sep) out = open(args.out + '.nucmap_combined.bed','w') out.write(new.asBed()) out.close() pysam.tabix_compress(args.out + '.nucmap_combined.bed', args.out + '.nucmap_combined.bed.gz',force = True) shell_command('rm ' + args.out + '.nucmap_combined.bed') pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset = "bed", force = True)
def run_diff(args, bases=500000):
    """run differential occupancy calling
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=args.flank + args.upper / 2 + max(pwm.up, pwm.down))
    chunks.merge()
    maxQueueSize = max(2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))

    # get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()

    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm,
                                 sep=args.nuc_sep, min_occ=args.min_occ,
                                 flank=args.flank, bam=args.bam,
                                 ci=args.confidence_interval)
    sets = chunks.split(bases=bases)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target=_writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset="bed", force=True)

def testEmptyFileVCFGZWithoutIndex(self): with open("tests/tmp_testEmptyFileWithoutIndex.vcf", "w"): pass pysam.tabix_compress("tests/tmp_testEmptyFileWithoutIndex.vcf", "tests/tmp_testEmptyFileWithoutIndex.vcf.gz", force=True) self.assertRaises(ValueError, pysam.VariantFile, "tests/tmp_testEmptyFileWithoutIndex.vcf.gz") os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf") os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf.gz")
def bgzip_file(original_file, new_file, delete_original=False, force=True):
    """
    :param original_file:
    :param new_file:
    :param force:
    :return:
    """
    pysam.tabix_compress(original_file, new_file, force)

    if delete_original:
        delete_file(original_file)

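# A typical call site for a helper like bgzip_file above: compress a sorted VCF, index it with
# the vcf preset, then open it for region queries. A minimal sketch, assuming an existing
# sorted "calls.vcf" (the file name and region are hypothetical):
import pysam

pysam.tabix_compress("calls.vcf", "calls.vcf.gz", force=True)
pysam.tabix_index("calls.vcf.gz", preset="vcf", force=True)

with pysam.VariantFile("calls.vcf.gz") as vcf:
    for rec in vcf.fetch("chr1", 100000, 200000):
        print(rec.chrom, rec.pos, rec.ref, rec.alts)
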
def testEmptyFileVCFGZWithoutIndex(self): with open("tmp_testEmptyFileWithoutIndex.vcf", "w"): pass pysam.tabix_compress("tmp_testEmptyFileWithoutIndex.vcf", "tmp_testEmptyFileWithoutIndex.vcf.gz", force=True) self.assertRaises(ValueError, pysam.VariantFile, "tmp_testEmptyFileWithoutIndex.vcf.gz") os.unlink("tmp_testEmptyFileWithoutIndex.vcf") os.unlink("tmp_testEmptyFileWithoutIndex.vcf.gz")
def indexFile(options):
    sys.stdout.write('Compressing output file ... ')
    sys.stdout.flush()
    pysam.tabix_compress(options.output, options.output + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file ... ')
    sys.stdout.flush()
    pysam.tabix_index(options.output + '.gz', seq_col=1, start_col=2, end_col=2, meta_char='#', force=True)
    sys.stdout.write('OK\n')

def run_nfr(args): """run nfr calling """ if args.bam is None and args.ins_track is None: raise Exception("Must supply either bam file or insertion track") if not args.out: args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3]) chunks = ChunkList.read(args.bed) chunks.merge() maxQueueSize = args.cores * 10 params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper, fasta = args.fasta, pwm = args.pwm) sets = chunks.split(items = args.cores * 5) pool1 = mp.Pool(processes = max(1,args.cores-1)) nfr_handle = open(args.out + '.nfrpos.bed','w') nfr_handle.close() nfr_queue = mp.JoinableQueue() nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out)) nfr_process.start() if params.ins_track is None: ins_handle = open(args.out + '.ins.bedgraph','w') ins_handle.close() ins_queue = mp.JoinableQueue() ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out)) ins_process.start() for j in sets: tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params))) for result in tmp: if params.ins_track is None: nfr_queue.put(result[0]) ins_queue.put(result[1]) else: nfr_queue.put(result) pool1.close() pool1.join() nfr_queue.put('STOP') nfr_process.join() if params.ins_track is None: ins_queue.put('STOP') ins_process.join() pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True) shell_command('rm ' + args.out + '.nfrpos.bed') pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True) if params.ins_track is None: pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.ins.bedgraph') pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
def run_merge(args): if not args.out: args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3]) occ = NucList.read(args.occpeaks, "occ", args.min_occ) nuc = NucList.read(args.nucpos, "nuc", args.min_occ) new = merge(occ, nuc, args.sep) out = open(args.out + '.nucmap_combined.bed', 'w') out.write(new.asBed()) out.close() pysam.tabix_compress(args.out + '.nucmap_combined.bed', args.out + '.nucmap_combined.bed.gz', force=True) shell_command('rm ' + args.out + '.nucmap_combined.bed') pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset="bed", force=True)
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz', seq_col=2, start_col=4, end_col=5, meta_char='#', force=True)
        sys.stdout.write('OK\n')
    else:
        print('Compressing file...')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        print('Indexing file...')
        pysam.tabix_index(filename + '.gz', seq_col=1, start_col=2, end_col=2, meta_char='#', force=True)

def indexFile(f):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    assert os.path.exists(f), f"{f} does not exist"
    pysam.tabix_compress(f, f + '.gz', force=True)
    sys.stdout.write('OK\n')
    if os.path.exists(f):
        os.remove(f)
    sys.stdout.write(f'Indexing output file {f}.gz... ')
    sys.stdout.flush()
    pysam.tabix_index(f + '.gz', seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')

def bgzip_index(original_file, new_file, file_format):
    """
    :param original_file:
    :param new_file:
    :param file_format:
    :return:
    """
    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))

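# For the FASTA branch above: bgzip-compressing a reference with tabix_compress and then
# building a faidx index lets pysam.FastaFile read it directly. A minimal sketch, assuming
# an uncompressed "ref.fa" (the file name and contig are hypothetical):
import pysam

pysam.tabix_compress("ref.fa", "ref.fa.gz", force=True)  # bgzf, not plain gzip
pysam.faidx("ref.fa.gz")                                 # writes ref.fa.gz.fai (and .gzi)

with pysam.FastaFile("ref.fa.gz") as fasta:
    seq = fasta.fetch("chr1", 0, 80)
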
def bgzip(in_fn, remove=True):
    """ convert file to bgzipped format """
    if is_gz_file(in_fn):
        tmp_out_fn = in_fn.replace(".gz", "")
        out_fn = in_fn.replace(".gz", ".bgz")
        ungzip(in_fn, tmp_out_fn)
    else:
        tmp_out_fn = in_fn
        out_fn = in_fn + ".bgz"

    pysam.tabix_compress(tmp_out_fn, out_fn, force=True)

    if remove:
        os.unlink(tmp_out_fn)
    return out_fn

def _make_assembly_vcf(self):
    tmp_vcf = self.final_assembly_vcf + '.tmp'
    cmd = ' '.join([
        self.samtools_exe, 'mpileup',
        '-t INFO/DPR,DV',
        '-A',
        '-f', self.final_assembly_fa,
        '-u',
        '-v',
        self.final_assembly_bam,
        '>', tmp_vcf
    ])

    common.syscall(cmd, verbose=self.verbose)

    cmd = ' '.join([
        self.bcftools_exe, 'call -m', tmp_vcf,
        '|', self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
        '>', self.final_assembly_read_depths + '.tmp'
    ])

    common.syscall(cmd, verbose=self.verbose)
    pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
    pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.final_assembly_read_depths + '.tmp')

    cmd = ' '.join([
        self.bcftools_exe, 'call -m -v', tmp_vcf,
        '|', self.bcftools_exe, 'filter',
        '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
        ' & MIN(DV)>=' + str(self.bcf_min_dv),
        ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
        ' & QUAL >=', str(self.bcf_min_qual), '"',
        '-o', self.final_assembly_vcf
    ])

    common.syscall(cmd, verbose=self.verbose)
    os.unlink(tmp_vcf)

def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c', '--chr-col', type='int', default=0, dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', default=1, dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname, seq_col=options.chrom_col,
                      start_col=options.start_col, end_col=options.end_col)

def __init__(self, fileName, samples):
    self.samples = samples
    self.sampleIndexes = []
    self.nColumns = 0

    # Compress with bgzip
    if not fileName.endswith('.gz'):
        if not os.path.isfile(fileName + '.gz'):
            pysam.tabix_compress(fileName, fileName + '.gz')
        fileName += '.gz'
    # Build tabix index
    if not os.path.isfile(fileName + '.tbi'):
        pysam.tabix_index(fileName, preset='vcf')

    nLines = 0
    fp = gzip.open(fileName, 'rt')
    line = fp.readline()
    while line:
        nLines += 1
        if line.startswith('##'):
            line = fp.readline()
        elif line.startswith('#'):
            # Header line
            break
        else:
            line = None  # Content line, no header line found
    else:
        raise ValueError("Header not found.")

    # Get the column index of selected samples
    headers = line[1:].rstrip().split(FS)
    self.nColumns = len(headers)
    if self.nColumns <= 9:
        raise ValueError("Not enough columns in header.")
    for name in self.samples:
        if name in headers[9:]:
            self.sampleIndexes.append(headers.index(name))
        else:
            raise ValueError("Sample %s not found in header." % name)

    self.tabix = pysam.Tabixfile(fileName)
    self.chroms = self.tabix.contigs
    self.fileName = fileName

def convert_VariantFile_to_IndexedVariantFile(vf_path: str, ivf_path: str) -> None:
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    # Avoid using the same tmp path as augment-phenos
    tmp_path = '{}/cvt-{}'.format(os.path.dirname(tmp_path), os.path.basename(tmp_path))
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )

def make_bias_track(args, bases=500000, splitsize=1000):
    """function to compute bias track
    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.checkChroms(list(params.chrs.keys()))
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, list(zip(j, itertools.repeat(params))))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset="bed", force=True)

def get_ins(args, bases=50000, splitsize=1000):
    """function to get insertions
    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        sets = chunks.split(items=bases / splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.ins.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeIns, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        if args.smooth:
            tmp = pool1.map(_insHelperSmooth, list(zip(j, itertools.repeat(args))))
        else:
            tmp = pool1.map(_insHelper, list(zip(j, itertools.repeat(args))))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force=True)
    shell_command('rm ' + args.out + '.ins.bedgraph')
    pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset="bed", force=True)

def _make_vcf_and_read_depths_files(self):
    tmp_vcf = self.vcf_file + '.tmp'
    cmd = ' '.join([
        self.samtools_exe, 'mpileup',
        '-t INFO/AD',
        '-A',
        '-f', self.ref_fa,
        '-u',
        '-v',
        self.bam,
        '>', tmp_vcf
    ])

    common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)

    cmd = ' '.join([
        self.bcftools_exe, 'call -m', tmp_vcf,
        '|', self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%AD]\n' ''',
        '>', self.read_depths_file + '.tmp'
    ])

    common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
    pysam.tabix_compress(self.read_depths_file + '.tmp', self.read_depths_file)
    pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.read_depths_file + '.tmp')

    cmd = ' '.join([
        self.bcftools_exe, 'call -m -v', tmp_vcf,
        '|', self.bcftools_exe, 'filter',
        '-i', '"SUM(AD)>=5 & MIN(AD)/DP>=0.1"',
        '-o', self.vcf_file
    ])

    common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
    os.unlink(tmp_vcf)

def compress_tabix(self, **kwargs):
    force = kwargs.get('force', False)
    timing = kwargs.get('timing', False)
    t1 = time.time()
    if self.__filename_compressed_exists and not force:
        sys.stderr.write("%s exists\n" % self.filename_compressed)
        return False
    else:
        try:
            pysam.tabix_compress(self.filename, self.filename_compressed, force=force)
            self.__filename_compressed_exists = True
            t2 = time.time()
            if timing:
                sys.stderr.write('Total time: %s\n' % (t2 - t1))
            return True
        except:
            sys.stderr.write('Unexpected error during compression: %s\n' % sys.exc_info()[1])
            return False

def __init__(self, fileNames):
    """ Constructor. Takes the name of a vcf file. """
    for fileName in fileNames:
        if ".gz" not in fileName:
            try:
                pysam.tabix_compress(fileName, fileName + ".gz")
                fileName += ".gz"
            except IOError:
                pass
        try:
            pysam.tabix_index(fileName, preset="vcf")
        except IOError:
            pass

    self.vcfFiles = [pysam.Tabixfile(fileName) for fileName in fileNames]

def bgzip_index(inFile, outFile, params={}, tabixPath=None): assert not inFile.endswith('.gz') and outFile.endswith('.gz') log.debug("compressing with bgzip %s -> %s" % (inFile, outFile)) if tabixPath: cmdline = "%s/bgzip -c %s > %s" % (tabixPath, inFile, outFile) assert not os.system(cmdline) else: pysam.tabix_compress(self.tmpFile, self.outFile, force=True) log.debug("indexing with tabix: %s" % outFile) if tabixPath: cmdline = "%s/tabix %s -f" % (tabixPath, outFile) if params.get('seq_col') != None: cmdline += ' -s %d' % params['seq_col'] if params.get('start_col') != None: cmdline += ' -b %d' % params['start_col'] if params.get('end_col') != None: cmdline += ' -e %d' % params['end_col'] if params.get('preset') != None: cmdline += ' -p %s' % params['preset'] if params.get('meta_char') != None: cmdline += ' -c %s' % params['meta_char'] if params.get('line_skip') != None: cmdline += ' -S %d' % params['line_skip'] if params.get('zerobased') != None: cmdline += ' -0' assert not os.system(cmdline) else: assert not params.get( 'line_skip' ), "error: pysam does not seem to support this option, even though their documentation talks about it" pysam.tabix_index(self.outFile, force=True, seq_col=params.get('seq_col'), start_col=params.get('start_col'), end_col=params.get('end_col'), preset=params.get('preset'), meta_char=params.get('meta_char', '#'), zerobased=params.get('zerobased', False)) return outFile
def index_gtf(file_path, output_path, sort=True, force=True):
    # type: (pathlib.Path, pathlib.Path) -> None
    """Compresses and indexes a gtf file using bgzip and tabix."""

    # Sort file before compressing and indexing.
    if sort:
        sorted_path = _append_suffix(output_path, '.srt')
        sort_gtf(file_path, output_path=sorted_path)
    else:
        sorted_path = file_path

    # Gzip and index file.
    pysam.tabix_compress(native_str(sorted_path),
                         filename_out=native_str(output_path), force=force)
    pysam.tabix_index(native_str(output_path), preset='gff', force=force)

    # Clean up sort temp file.
    if sort:
        sorted_path.unlink()

def compress_depth_file(filename, outfile=None, delete_file=True):
    """compresses a tab file.

    Args:
        filename (str): file to compress
        outfile (str): name of compressed file, default is [filename].gz
        delete_file (bool): delete original file after compression, default is True

    Returns:
        filename (str): filename of compressed file
    """
    if outfile is None:
        outfile = "{}.gz".format(filename)

    pysam.tabix_compress(filename, outfile, force=True)

    if delete_file:
        os.unlink(filename)

    return outfile

def genotype_single_sample(bam, vcf_in, out_dir):
    lib_info_json = bam + ".json"
    sample = fetchId(bam)
    out_vcf = os.path.join(out_dir, sample + ".gt.vcf")
    with open(vcf_in, "r") as inf, open(out_vcf, "w") as outf:
        single.sso_genotype(bam_string=bam,
                            vcf_in=inf,
                            vcf_out=outf,
                            min_aligned=20,
                            split_weight=1,
                            disc_weight=1,
                            num_samp=1000000,
                            lib_info_path=lib_info_json,
                            debug=False,
                            ref_fasta=None,
                            sum_quals=False,
                            max_reads=1000,
                            cores=None,
                            batch_size=None)
    out_gz = out_vcf + ".gz"
    tabix_compress(out_vcf, out_gz, force=True)
    tabix_index(out_gz, force=True, preset="vcf")
    return out_gz

def output_gff2(gff2_lines, fn):
    out = open(fn + '_tmp', 'w')
    out.write('##gff-version 2\n')
    for c in list(map(str, range(1, 23))) + ['X', 'Y', 'MT']:
        if c not in gff2_lines:
            continue
        gff2_lines[c] = sorted(gff2_lines[c], key=itemgetter(3, 4))
        for x in gff2_lines[c]:
            x = map(str, x)
            out.write('\t'.join(x) + '\n')
    out.close()
    pysam.tabix_compress(fn + '_tmp', fn + '.gz', force=True)
    pysam.tabix_index(fn + '.gz', seq_col=0, start_col=3, end_col=4, meta_char='#', force=True)
    os.remove(fn + '_tmp')

def bgzip_index(inFile, outFile, params={}, tabixPath=None): assert not inFile.endswith('.gz') and outFile.endswith('.gz') log.debug("compressing with bgzip %s -> %s" % (inFile, outFile)) if tabixPath: cmdline = "%s/bgzip -c %s > %s" % (tabixPath, inFile, outFile) assert not os.system(cmdline) else: pysam.tabix_compress(self.tmpFile, self.outFile, force=True) log.debug("indexing with tabix: %s" % outFile) if tabixPath: cmdline = "%s/tabix %s -f" % (tabixPath, outFile) if params.get('seq_col')!=None: cmdline += ' -s %d' % params['seq_col'] if params.get('start_col')!=None: cmdline += ' -b %d' % params['start_col'] if params.get('end_col')!=None: cmdline += ' -e %d' % params['end_col'] if params.get('preset')!=None: cmdline += ' -p %s' % params['preset'] if params.get('meta_char')!=None: cmdline += ' -c %s' % params['meta_char'] if params.get('line_skip')!=None: cmdline += ' -S %d' % params['line_skip'] if params.get('zerobased')!=None: cmdline += ' -0' assert not os.system(cmdline) else: assert not params.get('line_skip'), "error: pysam does not seem to support this option, even though their documentation talks about it" pysam.tabix_index(self.outFile, force=True, seq_col=params.get('seq_col'), start_col=params.get('start_col'), end_col=params.get('end_col'), preset=params.get('preset'), meta_char=params.get('meta_char','#'), zerobased=params.get('zerobased',False)) return outFile
def _make_vcf_and_read_depths_files(self):
    if not os.path.exists(self.ref_fa + '.fai'):
        pysam.faidx(self.ref_fa)

    tmp_vcf = self.vcf_file + '.tmp'
    with open(tmp_vcf, 'w') as f:
        print(pysam.mpileup(
            '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
            '-L', '99999999',
            '-A',
            '-f', self.ref_fa,
            '-u',
            '-v',
            self.bam,
        ), end='', file=f)

    got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix, self.min_var_read_depth,
                                      self.min_second_var_read_depth, self.max_allele_freq)
    if got != 0:
        raise Error('Error parsing vcf file. Cannot continue')

    pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file)
    pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.outprefix + '.read_depths')
    os.unlink(tmp_vcf)

def run_occ(args):
    """run occupancy calling
    """
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=args.flank + args.upper/2 + max(pwm.up, pwm.down) + args.nuc_sep/2)
    chunks.slop(chrs, up=args.nuc_sep/2, down=args.nuc_sep/2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    fragment_dist = FragmentMixDistribution(0, upper=args.upper)
    if args.sizes is not None:
        tmp = FragmentSizes.open(args.sizes)
        fragment_dist.fragmentsizes = FragmentSizes(0, args.upper, vals=tmp.get(0, args.upper))
    else:
        fragment_dist.getFragmentSizes(args.bam, chunks)
    fragment_dist.modelNFR()
    fragment_dist.plotFits(args.out + '.occ_fit.eps')
    fragment_dist.fragmentsizes.save(args.out + '.fragmentsizes.txt')
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm,
                                 sep=args.nuc_sep, min_occ=args.min_occ, flank=args.flank,
                                 bam=args.bam, ci=args.confidence_interval, step=args.step)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle1 = open(args.out + '.occ.bedgraph', 'w')
    out_handle1.close()
    out_handle2 = open(args.out + '.occ.lower_bound.bedgraph', 'w')
    out_handle2.close()
    out_handle3 = open(args.out + '.occ.upper_bound.bedgraph', 'w')
    out_handle3.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeOcc, args=(write_queue, args.out))
    write_process.start()
    peaks_handle = open(args.out + '.occpeaks.bed', 'w')
    peaks_handle.close()
    peaks_queue = mp.JoinableQueue()
    peaks_process = mp.Process(target=_writePeaks, args=(peaks_queue, args.out))
    peaks_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            nuc_dist += result[0]
            write_queue.put(result[1])
            peaks_queue.put(result[2])
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    peaks_queue.put('STOP')
    write_process.join()
    peaks_process.join()
    pysam.tabix_compress(args.out + '.occpeaks.bed', args.out + '.occpeaks.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occpeaks.bed')
    pysam.tabix_index(args.out + '.occpeaks.bed.gz', preset="bed", force=True)
    for i in ('occ', 'occ.lower_bound', 'occ.upper_bound'):
        pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz', force=True)
        shell_command('rm ' + args.out + '.' + i + '.bedgraph')
        pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
    dist_out = FragmentSizes(0, args.upper, vals=nuc_dist)
    dist_out.save(args.out + '.nuc_dist.txt')

    print("Making figure")
    # make figure
    fig = plt.figure()
    plt.plot(range(0, args.upper), dist_out.get(0, args.upper), label="Nucleosome Distribution")
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.nuc_dist.eps')
    plt.close(fig)

def run(self):
    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)
    # Clean out, make and re-populate references directory
    # For now, assume a single, statically-named referenceSet
    print("Converting references...", file=sys.stderr)
    shutil.rmtree(self.refsetsDirectory, ignore_errors=True)
    os.makedirs(self.refsetsDirectory)
    shutil.copy(
        os.path.join(self.inputDirectory, "referenceset_hg37.json"),
        os.path.join(self.refsetsDirectory, "hg37.json"))
    os.makedirs(self.hg37Directory)
    for refFile in self.referenceFiles:
        refBase = os.path.splitext(refFile)[0]
        destFastaFilename = os.path.join(self.hg37Directory, refBase) + ".fa"
        shutil.copy(os.path.join(self.inputDirectory, refBase) + ".fa", destFastaFilename)
        pysam.tabix_compress(destFastaFilename, destFastaFilename + ".gz")
        refFasta = pysam.FastaFile(destFastaFilename + ".gz")
        refFasta.close()
        os.remove(destFastaFilename)
        shutil.copy(
            os.path.join(self.inputDirectory, refBase) + ".json",
            os.path.join(self.hg37Directory, refBase) + ".json")
    # Clean out, make and repopulate dataset directories
    shutil.rmtree(self.datasetsDirectory, ignore_errors=True)
    os.makedirs(self.datasetsDirectory)
    for ds in self.datasets:
        dsdir = os.path.join(self.datasetsDirectory, ds)
        os.makedirs(dsdir)
        # Reads
        print("Converting reads...", file=sys.stderr)
        dsReadsdir = os.path.join(dsdir, "reads")
        os.makedirs(dsReadsdir)
        for readFile in self.datasetReads[ds]:
            destFile = os.path.join(dsReadsdir, readFile.split('_')[1].split('.')[0]) + ".bam"
            readSrc = pysam.AlignmentFile(os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(destFile, "wb", header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
        # Variants
        print("Converting variants...", file=sys.stderr)
        dsVariantsdir = os.path.join(dsdir, "variants")
        os.makedirs(dsVariantsdir)
        for vgroup in self.datasetVariants[ds].keys():
            vgroupdir = os.path.join(dsVariantsdir, vgroup)
            os.makedirs(vgroupdir)
            for variantFile in self.datasetVariants[ds][vgroup]:
                destFile = os.path.join(vgroupdir, variantFile.split('_')[2])
                shutil.copy(os.path.join(self.inputDirectory, variantFile), destFile)
                # Pysam's tabix_index automatically compresses the file
                # in place, creates a tabix index.
                pysam.tabix_index(destFile, preset="vcf")
    print("done converting compliance data.", file=sys.stderr)

def run(self): if not os.path.exists(self.outputDirectory): os.makedirs(self.outputDirectory) self.repo.open("w") self.repo.initialise() referenceFileName = "ref_brca1.fa" inputRef = os.path.join( self.inputDirectory, referenceFileName) outputRef = os.path.join( self.outputDirectory, referenceFileName) shutil.copy(inputRef, outputRef) fastaFilePath = os.path.join( self.outputDirectory, referenceFileName + '.gz') pysam.tabix_compress( outputRef, fastaFilePath) with open( os.path.join( self.inputDirectory, "ref_brca1.json")) as refMetadataFile: refMetadata = json.load(refMetadataFile) with open( os.path.join( self.inputDirectory, "referenceset_hg37.json")) as refMetadataFile: refSetMetadata = json.load(refMetadataFile) referenceSet = references.HtslibReferenceSet( refSetMetadata['assemblyId']) referenceSet.populateFromFile(fastaFilePath) referenceSet.setAssemblyId(refSetMetadata['assemblyId']) referenceSet.setDescription(refSetMetadata['description']) referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId']) referenceSet.setIsDerived(refSetMetadata['isDerived']) referenceSet.setSourceUri(refSetMetadata['sourceUri']) referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions']) for reference in referenceSet.getReferences(): reference.setNcbiTaxonId(refMetadata['ncbiTaxonId']) reference.setSourceAccessions( refMetadata['sourceAccessions']) self.repo.insertReferenceSet(referenceSet) dataset = datasets.Dataset("brca1") self.repo.insertDataset(dataset) hg00096Individual = biodata.Individual(dataset, "HG00096") with open( os.path.join( self.inputDirectory, "individual_HG00096.json")) as jsonString: hg00096Individual.populateFromJson(jsonString.read()) self.repo.insertIndividual(hg00096Individual) hg00096BioSample = biodata.BioSample(dataset, "HG00096") with open( os.path.join( self.inputDirectory, "bioSample_HG00096.json")) as jsonString: hg00096BioSample.populateFromJson(jsonString.read()) hg00096BioSample.setIndividualId(hg00096Individual.getId()) self.repo.insertBioSample(hg00096BioSample) hg00099Individual = biodata.Individual(dataset, "HG00099") with open( os.path.join( self.inputDirectory, "individual_HG00099.json")) as jsonString: hg00099Individual.populateFromJson(jsonString.read()) self.repo.insertIndividual(hg00099Individual) hg00099BioSample = biodata.BioSample(dataset, "HG00099") with open( os.path.join( self.inputDirectory, "bioSample_HG00099.json")) as jsonString: hg00099BioSample.populateFromJson(jsonString.read()) hg00099BioSample.setIndividualId(hg00099Individual.getId()) self.repo.insertBioSample(hg00099BioSample) hg00101Individual = biodata.Individual(dataset, "HG00101") with open( os.path.join( self.inputDirectory, "individual_HG00101.json")) as jsonString: hg00101Individual.populateFromJson(jsonString.read()) self.repo.insertIndividual(hg00101Individual) hg00101BioSample = biodata.BioSample(dataset, "HG00101") with open( os.path.join( self.inputDirectory, "bioSample_HG00101.json")) as jsonString: hg00101BioSample.populateFromJson(jsonString.read()) hg00101BioSample.setIndividualId(hg00101Individual.getId()) self.repo.insertBioSample(hg00101BioSample) readFiles = [ "brca1_HG00096.sam", "brca1_HG00099.sam", "brca1_HG00101.sam"] for readFile in readFiles: name = readFile.split('_')[1].split('.')[0] readSrc = pysam.AlignmentFile( os.path.join(self.inputDirectory, readFile), "r") readDest = pysam.AlignmentFile( os.path.join( self.outputDirectory, name + ".bam"), "wb", header=readSrc.header) destFilePath = readDest.filename for readData in readSrc: readDest.write(readData) 
readDest.close() readSrc.close() pysam.index(destFilePath) readGroupSet = reads.HtslibReadGroupSet(dataset, name) readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai") readGroupSet.setReferenceSet(referenceSet) bioSamples = [hg00096BioSample, hg00099BioSample, hg00101BioSample] for readGroup in readGroupSet.getReadGroups(): for bioSample in bioSamples: if bioSample.getLocalId() == readGroup.getSampleName(): readGroup.setBioSampleId(bioSample.getId()) self.repo.insertReadGroupSet(readGroupSet) ontologyMapFileName = "so-xp-simple.obo" inputOntologyMap = os.path.join( self.inputDirectory, ontologyMapFileName) outputOntologyMap = os.path.join( self.outputDirectory, ontologyMapFileName) shutil.copy(inputOntologyMap, outputOntologyMap) sequenceOntology = ontologies.Ontology("so-xp-simple") sequenceOntology.populateFromFile(outputOntologyMap) sequenceOntology._id = "so-xp-simple" self.repo.insertOntology(sequenceOntology) self.repo.addOntology(sequenceOntology) vcfFiles = [ "brca1_1kgPhase3_variants.vcf", "brca1_WASH7P_annotation.vcf", "brca1_OR4F_annotation.vcf"] for vcfFile in vcfFiles: self.addVariantSet( vcfFile, dataset, referenceSet, sequenceOntology, bioSamples) seqAnnFile = "brca1_gencodev19.gff3" seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile) seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db") dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest) dbgen.run() gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19") gencode.setOntology(sequenceOntology) gencode.populateFromFile(seqAnnDest) gencode.setReferenceSet(referenceSet) self.repo.insertFeatureSet(gencode) self.repo.commit() print("Done converting compliance data.", file=sys.stderr)
def run_nuc(args):
    """run occupancy calling
    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs,
                            min_offset=vmat.mat.shape[1] + vmat.upper // 2 + max(pwm.up, pwm.down) + args.nuc_sep // 2,
                            min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=args.nuc_sep // 2, down=args.nuc_sep // 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat=vmat, fragmentsizes=fragment_dist, bam=args.bam,
                           fasta=args.fasta, pwm=args.pwm,
                           occ_track=args.occ_track,
                           sd=args.sd, nonredundant_sep=args.nuc_sep,
                           redundant_sep=args.redundant_sep,
                           min_z=args.min_z, min_lr=args.min_lr,
                           atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    if args.write_all:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth', 'nucleoatac_background', 'nucleoatac_raw']
    else:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal',
                   'nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize=maxQueueSize)
        write_processes[i] = mp.Process(target=_writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, list(zip(j, itertools.repeat(params))))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph',
                                 args.out + '.' + i + '.bedgraph.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed',
                                 args.out + '.' + i + '.bed.gz', force=True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset="bed", force=True)

def testCompression(self):
    '''see also issue 106'''
    pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
    checkBinaryEqual(self.tmpfilename, self.tmpfilename + ".gz")