Example #1
def _create_tabix(fname, ftype):
    logger = logging.getLogger("pita")
    tabix_file = ""
    logger.info("Creating tabix index for %s", os.path.basename(fname))
    logger.debug("Preparing %s for tabix", fname)
    tmp = NamedTemporaryFile(prefix="pita", delete=False)
    preset = "gff"
    if ftype == "bed":
        cmd = "sort -k1,1 -k2g,2 {0} | grep -v track | grep -v \"^#\" > {1}"
        preset = "bed"
    elif ftype in ["gff", "gff3", "gtf"]:
        cmd = "sort -k1,1 -k4g,4 {0} | grep -v \"^#\" > {1}"
    else:
        raise ValueError("unsupported file type for tabix: {0}".format(ftype))

    # Sort the input file
    logger.debug(cmd.format(fname, tmp.name))
    sp.call(cmd.format(fname, tmp.name), shell=True)
    # Compress using bgzip
    logger.debug("compressing %s", tmp.name)
    tabix_file = tmp.name + ".gz"
    pysam.tabix_compress(tmp.name, tabix_file)
    tmp.close()
    # Index (using tabix command line, as pysam.index results in a Segmentation fault)
    logger.debug("indexing %s", tabix_file)
    sp.call("tabix {0} -p {1}".format(tabix_file, preset), shell=True)
    return tabix_file
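
The example above shells out to both sort and the tabix binary. When the input is already coordinate-sorted, the same compress-and-index step can be done entirely within pysam; a minimal sketch, assuming a hypothetical sorted BED file "data.bed":

import pysam

# bgzip-compress the sorted BED file, then build the .tbi index next to it
pysam.tabix_compress("data.bed", "data.bed.gz", force=True)
pysam.tabix_index("data.bed.gz", preset="bed", force=True)  # writes data.bed.gz.tbi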
Example #2
    def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        args = [
            '-treatAllAsProteinCoding', 'false',
            '-t',
            '-noLog',
            '-ud', '0',
            '-noStats',
            '-noShiftHgvs',
            genome,
            inVcf
            ]
        with open(tmpVcf, 'wt') as outf:
            self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #3
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    tmpfile = tempfile.NamedTemporaryFile()
    sort_params = None

    if options.chrom_col and options.start_col and options.end_col:
        sort_params = [
            "sort",
            "-k%(i)s,%(i)s" % {'i': options.chrom_col},
            "-k%(i)i,%(i)in" % {'i': options.start_col},
            "-k%(i)i,%(i)in" % {'i': options.end_col}
        ]
    elif options.preset == "bed":
        sort_params = ["sort", "-k1,1", "-k2,2n", "-k3,3n"]
    elif options.preset == "vcf":
        sort_params = ["sort", "-k1,1", "-k2,2n"]
    elif options.preset == "gff":
        sort_params = ["sort", "-s", "-k1,1", "-k4,4n"]  # stable sort on start column
    # Skip any lines starting with "#" and "track"
    grepped = subprocess.Popen(["grep", "-e", "^\"#\"", "-e", "^track", "-v", input_fname], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    after_sort = subprocess.Popen(sort_params, stdin=grepped.stdout, stderr=subprocess.PIPE, stdout=tmpfile)
    grepped.stdout.close()
    output, err = after_sort.communicate()

    pysam.tabix_compress(tmpfile.name, output_fname, force=True)
Example #4
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    pysam.tabix_compress(input_fname, output_fname, force=True)
Example #5
    def testIndexPresetCompressed(self):
        '''test indexing via preset.'''

        pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
        pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
        checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
        checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
Example #6
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """function to compute bias track

    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.Scores.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j,itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
Example #7
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compress {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath)
                )
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath)
            )

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)"
        )
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
Example #8
def get_cov(args, bases = 50000, splitsize = 1000):
    """function to get coverages

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.cov.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j,itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
Example #9
    def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
        """
        TODO: docstring here
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        else:
            tmpVcf = outVcf

        args = ' '.join([
                'eff',
                    '-c', '{}/snpEff.config'.format(self.executable_path()),
                    '-i', in_format,
                    '-o', out_format,
                    genome,
                    '-treatAllAsProteinCoding false',
                    '-noLog',
                    '-ud 0',
                    '-noStats',
                    eff_options
                ])

        if inVcf.endswith('.gz'):
            pre_pipe = "zcat {} | ".format(inVcf)
        else:
            pre_pipe = "cat {} | ".format(inVcf)
        post_pipe = " > {}".format(tmpVcf)
        self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe,
                post_pipe=post_pipe)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #10
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join(sorted(genomes))
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception("could not find or create a snpEff genome database")

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
                outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #11
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join([util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception("could not find or create a snpEff genome database")

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
                outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #12
    def testEmptyFileVCFGZWithoutIndex(self):
        with get_temp_context("tmp_testEmptyFileWithoutIndex.vcf") as fn:
            with open(fn, "w"):
                pass

            pysam.tabix_compress(fn, fn + ".gz", force=True)

            self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
Example #13
def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')
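
The example above indexes a custom tab-delimited layout by passing explicit column numbers instead of a preset. pysam counts these columns from 0, whereas the tabix command line counts from 1 (the comments in Examples #18 and #40 below make the same point). A short illustrative sketch, assuming a hypothetical "annotations.txt" whose 5th column is the sequence name and whose 7th/8th columns are start/end, matching the example:

import pysam

pysam.tabix_compress("annotations.txt", "annotations.txt.gz", force=True)
pysam.tabix_index("annotations.txt.gz", seq_col=4, start_col=6, end_col=7,
                  meta_char='#', force=True)
# roughly equivalent command line (1-based columns):
#   tabix -s 5 -b 7 -e 8 -c '#' annotations.txt.gz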
Example #14
def run_nfr(args):
    """run nfr calling

    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source = "BAM file") 
    chunks.merge()
    maxQueueSize = args.cores * 10 
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper,
                            fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example #15
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f), os.path.join(options.output_dir, f + '.gz'), force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'), seq_col=4, start_col=6, end_col=7, meta_char='#',
                      force=True)
    sys.stdout.write('OK\n')
Example #16
    def _index_with_tabix(self):
        """Compress and index output file by Tabix"""

        pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
        pysam.tabix_index(self._fn + '.gz',
                          seq_col=self.idx_chrom,
                          start_col=self.idx_start,
                          end_col=self.idx_end,
                          meta_char='#',
                          force=True)
Example #17
    def testEmptyFileVCFGZWithoutIndex(self):
        with get_temp_context("tmp_testEmptyFileWithoutIndex.vcf") as fn:
            with open(fn, "w"):
                pass

            pysam.tabix_compress(fn,
                                 fn + ".gz",
                                 force=True)

            self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
Example #18
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1, # skip header
    )
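
Once compressed and indexed this way, the file can be queried by region through pysam; a small sketch, assuming the index built above and a contig name and coordinate range that actually occur in the data (both hypothetical here):

import pysam

tbx = pysam.TabixFile(ivf_path)  # also opens ivf_path + ".tbi"
for row in tbx.fetch("1", 1000000, 1010000):
    print(row)  # each row is one raw tab-delimited line
tbx.close()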
Example #19
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1, # skip header
    )
Example #20
    def testEmptyFileVCFGZ(self):
        with open("tmp_testEmptyFile.vcf", "w"):
            pass

        pysam.tabix_compress("tmp_testEmptyFile.vcf",
                             "tmp_testEmptyFile.vcf.gz")

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tmp_testEmptyFile.vcf.gz")

        os.unlink("tmp_testEmptyFile.vcf")
        os.unlink("tmp_testEmptyFile.vcf.gz")
Example #21
    def testEmptyFileVCFGZ(self):
        with open("tmp_testEmptyFile.vcf", "w"):
            pass

        pysam.tabix_compress("tmp_testEmptyFile.vcf",
                             "tmp_testEmptyFile.vcf.gz")

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tmp_testEmptyFile.vcf.gz")

        os.unlink("tmp_testEmptyFile.vcf")
        os.unlink("tmp_testEmptyFile.vcf.gz")
Example #22
def bamTobed(bamInput=None, bedOutput=None, compress=True):
    # generate temp file for sorting and indexing
    bedOutput_path = os.path.realpath(bedOutput)
    this_pid = os.getpid()
    tmp_split = os.path.splitext(bedOutput_path)
    tmp_bedOutput = tmp_split[0] + "-temp-" + str(this_pid) + tmp_split[1]

    bai = bamInput + ".bai"
    if not os.path.exists(bai):
        message = "Index file " + bai + " do not exist!"
        raise commonError(message)

    bedWrite = open(tmp_bedOutput, "w")

    input_file = pysam.Samfile(bamInput, "rb")
    chr_reference = input_file.references
    for read1, read2 in read_pair_generator(input_file):
        read1Start = read1.reference_start
        read1End = read1.reference_end
        read2Start = read2.reference_start
        read2End = read2.reference_end

        if not read1.is_reverse:  # read1 is forward strand, read2 is reverse strand
            rstart = read1Start  # 0-based left-most site
            rend = read2End
        else:  # read1 is reverse strand, read2 is forward strand
            rstart = read2Start  # 0-based left-most site
            rend = read1End

        if (rstart < 0) or (rend < 0) or (rstart >= rend): continue

        tmp_str = chr_reference[read1.tid] + "\t" + str(rstart) + "\t" + str(
            rend) + "\n"
        bedWrite.write(tmp_str)

    bedWrite.close()
    print("Fragments generated, waitting for sorting......")

    bedData = pybedtools.BedTool(tmp_bedOutput)
    bedData.sort(output=bedOutput)

    os.remove(tmp_bedOutput)

    print("Fragments sorted.")

    if compress:
        print("Waitting for compressing and indexing......")
        bedgzfile = bedOutput + ".gz"
        pysam.tabix_compress(bedOutput, bedgzfile, force=False)
        pysam.tabix_index(bedgzfile, preset="bed", zerobased=True)
        print("Indexing bedgz file finished!")

    return True
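
A hypothetical call of the helper above, assuming "sample.bam" is coordinate-sorted and a "sample.bam.bai" index exists (the function raises commonError otherwise):

# writes sample.bed, then sample.bed.gz and sample.bed.gz.tbi
bamTobed(bamInput="sample.bam", bedOutput="sample.bed", compress=True)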
Example #23
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed','w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed', args.out + '.nucmap_combined.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset = "bed", force = True)
Example #24
def run_diff(args, bases=500000):
    """run differential occupancy calling

    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed,
                            chromDict=chrs,
                            min_offset=args.flank + args.upper / 2 +
                            max(pwm.up, pwm.down))
    chunks.merge()
    maxQueueSize = max(
        2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper=args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(
        0, args.upper, vals=FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()
    params = OccupancyParameters(fragment_dist,
                                 args.upper,
                                 args.fasta,
                                 args.pwm,
                                 sep=args.nuc_sep,
                                 min_occ=args.min_occ,
                                 flank=args.flank,
                                 bam=args.bam,
                                 ci=args.confidence_interval)
    sets = chunks.split(bases=bases)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target=_writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed',
                         args.out + '.occdiff.bed.gz',
                         force=True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset="bed", force=True)
Example #25
    def testEmptyFileVCFGZWithoutIndex(self):
        with open("tests/tmp_testEmptyFileWithoutIndex.vcf", "w"):
            pass

        pysam.tabix_compress("tests/tmp_testEmptyFileWithoutIndex.vcf",
                             "tests/tmp_testEmptyFileWithoutIndex.vcf.gz",
                             force=True)

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tests/tmp_testEmptyFileWithoutIndex.vcf.gz")

        os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf")
        os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf.gz")
Example #26
def bgzip_file(original_file, new_file, delete_original=False, force=True):
    """Bgzip-compress original_file to new_file.

    :param original_file: path of the file to compress
    :param new_file: path of the bgzipped output file
    :param delete_original: if True, delete original_file after compression
    :param force: overwrite new_file if it already exists
    """

    pysam.tabix_compress(original_file, new_file, force)

    if delete_original:
        delete_file(original_file)
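
A hypothetical usage of the helper above (delete_file is assumed to be defined in the same utility module):

# compress variants.vcf to variants.vcf.gz, keeping the original
bgzip_file("variants.vcf", "variants.vcf.gz", delete_original=False, force=True)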
Example #27
    def testEmptyFileVCFGZWithoutIndex(self):
        with open("tmp_testEmptyFileWithoutIndex.vcf", "w"):
            pass

        pysam.tabix_compress("tmp_testEmptyFileWithoutIndex.vcf",
                             "tmp_testEmptyFileWithoutIndex.vcf.gz",
                             force=True)

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tmp_testEmptyFileWithoutIndex.vcf.gz")

        os.unlink("tmp_testEmptyFileWithoutIndex.vcf")
        os.unlink("tmp_testEmptyFileWithoutIndex.vcf.gz")
Example #28
def indexFile(options):
    sys.stdout.write('Compressing output file ... ')
    sys.stdout.flush()
    pysam.tabix_compress(options.output, options.output + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file ... ')
    sys.stdout.flush()
    pysam.tabix_index(options.output + '.gz',
                      seq_col=1,
                      start_col=2,
                      end_col=2,
                      meta_char='#',
                      force=True)
    sys.stdout.write('OK\n')
Example #29
def run_nfr(args):
    """run nfr calling

    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    chunks = ChunkList.read(args.bed)
    chunks.merge()
    maxQueueSize = args.cores * 10 
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper,
                            fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example #30
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed', 'w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed',
                         args.out + '.nucmap_combined.bed.gz',
                         force=True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz',
                      preset="bed",
                      force=True)
Example #31
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz', seq_col=2, start_col=4, end_col=5, meta_char='#', force=True)
        sys.stdout.write('OK\n')
    else:
        sys.stdout.write('Compressing file...\n')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('Indexing file...\n')
        pysam.tabix_index(filename + '.gz', seq_col=1, start_col=2, end_col=2, meta_char='#', force=True)
Example #32
def indexFile(f):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    assert os.path.exists(f), f"{f} does not exist"
    pysam.tabix_compress(f, f + '.gz', force=True)
    sys.stdout.write('OK\n')
    if os.path.exists(f):
        os.remove(f)
    sys.stdout.write(f'Indexing output file {f}.gz... ')
    sys.stdout.flush()
    pysam.tabix_index(f + '.gz',
                      seq_col=4,
                      start_col=6,
                      end_col=7,
                      meta_char='#',
                      force=True)
    sys.stdout.write('OK\n')
Example #33
def bgzip_index(original_file, new_file, file_format):
    """Bgzip-compress and index original_file according to file_format.

    :param original_file: path of the file to compress and index
    :param new_file: path of the bgzipped output file (used for 'fa' input)
    :param file_format: 'fa' or 'vcf'
    """

    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
Example #34
def bgzip_index(original_file, new_file, file_format):
    """Bgzip-compress and index original_file according to file_format.

    :param original_file: path of the file to compress and index
    :param new_file: path of the bgzipped output file (used for 'fa' input)
    :param file_format: 'fa' or 'vcf'
    """

    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
Example #35
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    tmpfile = tempfile.NamedTemporaryFile()
    sort_params = None

    if options.chrom_col and options.start_col and options.end_col:
        sort_params = [
            "sort",
            "-k%(i)s,%(i)s" % {
                'i': options.chrom_col
            },
            "-k%(i)i,%(i)in" % {
                'i': options.start_col
            },
            "-k%(i)i,%(i)in" % {
                'i': options.end_col
            }
        ]
    elif options.preset == "bed":
        sort_params = ["sort", "-k1,1", "-k2,2n", "-k3,3n"]
    elif options.preset == "vcf":
        sort_params = ["sort", "-k1,1", "-k2,2n"]
    elif options.preset == "gff":
        sort_params = ["sort", "-s", "-k1,1",
                       "-k4,4n"]  # stable sort on start column
    # Skip any lines starting with "#" and "track"
    grepped = subprocess.Popen(
        ["grep", "-e", "^\"#\"", "-e", "^track", "-v", input_fname],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    after_sort = subprocess.Popen(sort_params,
                                  stdin=grepped.stdout,
                                  stderr=subprocess.PIPE,
                                  stdout=tmpfile)
    grepped.stdout.close()
    output, err = after_sort.communicate()

    pysam.tabix_compress(tmpfile.name, output_fname, force=True)
Example #36
def bgzip(in_fn, remove=True):
    """
    convert file to bgzipped format
    """
    if is_gz_file(in_fn):
        tmp_out_fn = in_fn.replace(".gz", "")
        out_fn = in_fn.replace(".gz", ".bgz")
        ungzip(in_fn, tmp_out_fn)
    else:
        tmp_out_fn = in_fn
        out_fn = in_fn + ".bgz"

    pysam.tabix_compress(tmp_out_fn, out_fn, force=True)
    if remove:
        os.unlink(tmp_out_fn)

    return out_fn
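
A hypothetical round trip with the helper above (is_gz_file and ungzip are assumed to come from the same module): a plain-gzip input is unpacked to a temporary path, re-compressed as bgzf, and the intermediate file removed.

out_fn = bgzip("counts.tsv.gz")  # returns "counts.tsv.bgz"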
Example #37
    def _make_assembly_vcf(self):
        tmp_vcf = self.final_assembly_vcf + '.tmp'
        cmd = ' '.join([
            self.samtools_exe, 'mpileup',
            '-t INFO/DPR,DV',
            '-A',
            '-f', self.final_assembly_fa,
            '-u',
            '-v',
            self.final_assembly_bam,
            '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)

        cmd = ' '.join([
            self.bcftools_exe, 'call -m',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'query',
            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
            '>',
            self.final_assembly_read_depths + '.tmp'
        ])

        common.syscall(cmd, verbose=self.verbose)
        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
        pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
        os.unlink(self.final_assembly_read_depths + '.tmp')

        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'filter',
            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
                  ' & QUAL >=', str(self.bcf_min_qual), '"',
            '-o', self.final_assembly_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)
        os.unlink(tmp_vcf)
Example #38
def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c', '--chr-col', type='int', default=0, dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', default=1, dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname, seq_col=options.chrom_col, start_col=options.start_col, end_col=options.end_col)
Example #39
    def __init__(self, fileName, samples):
        self.samples = samples
        self.sampleIndexes = []
        self.nColumns = 0

        # Compress with bgzip
        if not fileName.endswith('.gz'):
            if not os.path.isfile(fileName + '.gz'):
                pysam.tabix_compress(fileName, fileName + '.gz')
            fileName += '.gz'

        # Build tabix index
        if not os.path.isfile(fileName + '.tbi'):
            pysam.tabix_index(fileName, preset='vcf')

        nLines = 0
        fp = gzip.open(fileName, 'r')
        line = fp.readline()
        while line:
            nLines += 1
            if line.startswith('##'):
                line = fp.readline()
            elif line.startswith('#'):  # Header line
                break
            else:
                line = None        # Content line, no header line found
        else:
            raise ValueError("Header not found.")

        # Get the column index of selected samples
        headers = line[1:].rstrip().split(FS)
        self.nColumns = len(headers)
        if self.nColumns <= 9:
            raise ValueError("Not enough columns in header.")

        for name in self.samples:
            if name in headers[9:]:
                self.sampleIndexes.append(headers.index(name))
            else:
                raise ValueError("Sample %s not found in header." % name)

        self.tabix = pysam.Tabixfile(fileName)
        self.chroms = self.tabix.contigs
        self.fileName = fileName
Example #40
def convert_VariantFile_to_IndexedVariantFile(vf_path: str,
                                              ivf_path: str) -> None:
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    tmp_path = '{}/cvt-{}'.format(
        os.path.dirname(tmp_path), os.path.basename(
            tmp_path))  # Avoid using the same tmp path as augment-phenos
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path,
        force=True,
        seq_col=0,
        start_col=1,
        end_col=
        1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )
Example #41
def make_bias_track(args, bases=500000, splitsize=1000):
    """function to compute bias track

    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)

    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize=splitsize)
        sets = chunks.split(items=bases // splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.checkChroms(list(params.chrs.keys()))
        chunks.merge()
        sets = chunks.split(bases=bases)

    maxQueueSize = max(
        2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, list(zip(j, itertools.repeat(params))))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph',
                         args.out + '.Scores.bedgraph.gz',
                         force=True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz',
                      preset="bed",
                      force=True)
Example #42
def get_ins(args, bases=50000, splitsize=1000):
    """function to get insertions

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize=splitsize)
        sets = chunks.split(items=bases / splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases=bases)
    maxQueueSize = max(
        2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    out_handle = open(args.out + '.ins.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize=maxQueueSize)
    write_process = mp.Process(target=_writeIns, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        if args.smooth:
            tmp = pool1.map(_insHelperSmooth,
                            list(zip(j, itertools.repeat(args))))
        else:
            tmp = pool1.map(_insHelper, list(zip(j, itertools.repeat(args))))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.ins.bedgraph',
                         args.out + '.ins.bedgraph.gz',
                         force=True)
    shell_command('rm ' + args.out + '.ins.bedgraph')
    pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset="bed", force=True)
Example #43
    def _make_vcf_and_read_depths_files(self):
        tmp_vcf = self.vcf_file + '.tmp'
        cmd = ' '.join([
            self.samtools_exe, 'mpileup',
            '-t INFO/AD',
            '-A',
            '-f', self.ref_fa,
            '-u',
            '-v',
            self.bam,
            '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)

        cmd = ' '.join([
            self.bcftools_exe, 'call -m',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'query',
            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%AD]\n' ''',
            '>',
            self.read_depths_file + '.tmp'
        ])

        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
        pysam.tabix_compress(self.read_depths_file + '.tmp', self.read_depths_file)
        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
        os.unlink(self.read_depths_file + '.tmp')

        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'filter',
            '-i', '"SUM(AD)>=5 & MIN(AD)/DP>=0.1"',
            '-o', self.vcf_file
        ])

        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
        os.unlink(tmp_vcf)
Example #44
    def compress_tabix(self, **kwargs):
        force = kwargs.get('force', False)
        timing = kwargs.get('timing', False)
        t1 = time.time()
        if self.__filename_compressed_exists and not force:
            sys.stderr.write("%s exists\n" % self.filename_compressed)
            return False
        else:
            try:
                pysam.tabix_compress(self.filename, self.filename_compressed,
                                     force=force)
                self.__filename_compressed_exists = True
                t2 = time.time()
                if timing:
                    sys.stderr.write('Total time: %s\n' % (t2 - t1))
                return True
            except Exception:
                sys.stderr.write('Unexpected error during compression: %s\n' %
                                 sys.exc_info()[1])
                return False
Example #45
    def __init__(self, fileNames):
        """
        Constructor. Takes the names of vcf files.
        """
        indexedNames = []
        for fileName in fileNames:
            if ".gz" not in fileName:
                try:
                    pysam.tabix_compress(fileName, fileName + ".gz")
                    fileName += ".gz"
                except IOError:
                    pass

            try:
                pysam.tabix_index(fileName, preset="vcf")
            except IOError:
                pass
            indexedNames.append(fileName)

        self.vcfFiles = [
            pysam.Tabixfile(fileName) for fileName in indexedNames
        ]
Example #46
def bgzip_index(inFile, outFile, params={}, tabixPath=None):
    assert not inFile.endswith('.gz') and outFile.endswith('.gz')

    log.debug("compressing with bgzip %s -> %s" % (inFile, outFile))
    if tabixPath:
        cmdline = "%s/bgzip -c %s > %s" % (tabixPath, inFile, outFile)
        assert not os.system(cmdline)
    else:
        pysam.tabix_compress(inFile, outFile, force=True)

    log.debug("indexing with tabix: %s" % outFile)
    if tabixPath:
        cmdline = "%s/tabix %s -f" % (tabixPath, outFile)
        if params.get('seq_col') != None:
            cmdline += ' -s %d' % params['seq_col']
        if params.get('start_col') != None:
            cmdline += ' -b %d' % params['start_col']
        if params.get('end_col') != None:
            cmdline += ' -e %d' % params['end_col']
        if params.get('preset') != None:
            cmdline += ' -p %s' % params['preset']
        if params.get('meta_char') != None:
            cmdline += ' -c %s' % params['meta_char']
        if params.get('line_skip') != None:
            cmdline += ' -S %d' % params['line_skip']
        if params.get('zerobased') != None:
            cmdline += ' -0'
        assert not os.system(cmdline)
    else:
        assert not params.get(
            'line_skip'
        ), "error: pysam does not seem to support this option, even though their documentation talks about it"
        pysam.tabix_index(outFile,
                          force=True,
                          seq_col=params.get('seq_col'),
                          start_col=params.get('start_col'),
                          end_col=params.get('end_col'),
                          preset=params.get('preset'),
                          meta_char=params.get('meta_char', '#'),
                          zerobased=params.get('zerobased', False))
    return outFile
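
A hypothetical call of the helper above taking the pysam branch (no tabixPath), indexing a generic tab-delimited file on 0-based columns:

bgzip_index("regions.txt", "regions.txt.gz",
            params={'seq_col': 0, 'start_col': 1, 'end_col': 2, 'zerobased': True})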
Example #47
def index_gtf(file_path, output_path, sort=True, force=True):
    # type: (pathlib.Path, pathlib.Path) -> None
    """Compresses and indexes a gtf file using bgzip and tabix."""

    # Sort file before compressing and indexing.
    if sort:
        sorted_path = _append_suffix(output_path, '.srt')
        sort_gtf(file_path, output_path=sorted_path)
    else:
        sorted_path = file_path

    # Gzip and index file.
    pysam.tabix_compress(
        native_str(sorted_path),
        filename_out=native_str(output_path),
        force=force)
    pysam.tabix_index(native_str(output_path), preset='gff', force=force)

    # Clean up sort temp file.
    if sort:
        sorted_path.unlink()
Example #48
def compress_depth_file(filename, outfile=None, delete_file=True):
    """Bgzip-compresses a tab-delimited file.

    Args:
        filename (str): file to compress
        outfile (str): name of compressed file, default is [filename].gz
        delete_file (bool): delete original file after compression, default is True

    Returns:
        filename (str): filename of compressed file
    """

    if outfile is None:
        outfile = "{}.gz".format(filename)

    pysam.tabix_compress(filename, outfile, force=True)

    if delete_file:
        os.unlink(filename)

    return outfile
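
A hypothetical usage of the helper above:

# compresses sample.depth to sample.depth.gz and removes the original
compressed = compress_depth_file("sample.depth")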
Example #49
def genotype_single_sample(bam, vcf_in, out_dir):
    lib_info_json = bam + ".json"
    sample = fetchId(bam)
    out_vcf = os.path.join(out_dir, sample + ".gt.vcf")
    with open(vcf_in, "r") as inf, open(out_vcf, "w") as outf:
        single.sso_genotype(bam_string=bam,
                            vcf_in=inf,
                            vcf_out=outf,
                            min_aligned=20,
                            split_weight=1,
                            disc_weight=1,
                            num_samp=1000000,
                            lib_info_path=lib_info_json,
                            debug=False,
                            ref_fasta=None,
                            sum_quals=False,
                            max_reads=1000,
                            cores=None,
                            batch_size=None)
    out_gz = out_vcf + ".gz"
    tabix_compress(out_vcf, out_gz, force=True)
    tabix_index(out_gz, force=True, preset="vcf")
    return out_gz
Example #50
def output_gff2(gff2_lines, fn):

    out = open(fn + '_tmp', 'w')
    out.write('##gff-version 2\n')
    for c in list(map(str, range(1, 23))) + ['X', 'Y', 'MT']:
        if c not in gff2_lines:
            continue
        gff2_lines[c] = sorted(gff2_lines[c], key=itemgetter(3, 4))

        for x in gff2_lines[c]:
            out.write('\t'.join(map(str, x)) + '\n')
    out.close()

    pysam.tabix_compress(fn + '_tmp', fn + '.gz', force=True)
    pysam.tabix_index(fn + '.gz',
                      seq_col=0,
                      start_col=3,
                      end_col=4,
                      meta_char='#',
                      force=True)

    os.remove(fn + '_tmp')
Example #51
def run_diff(args, bases = 500000):
    """run differential occupancy calling

    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down))
    chunks.merge()
    maxQueueSize = max(2,int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0,args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0,args.upper))
    fragment_dist2.modelNFR()
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
            flank = args.flank, bam = args.bam, ci = args.confidence_interval)
    sets = chunks.split(bases = bases)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    diff_handle = open(args.out + '.occdiff.bed','w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target = _writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True)
Example #52
def bgzip_index(inFile, outFile, params={}, tabixPath=None):
	assert not inFile.endswith('.gz') and outFile.endswith('.gz')
	
	log.debug("compressing with bgzip %s -> %s" % (inFile, outFile))
	if tabixPath:
		cmdline = "%s/bgzip -c %s > %s" % (tabixPath, inFile, outFile)
		assert not os.system(cmdline)
	else:
		pysam.tabix_compress(inFile, outFile, force=True)
	
	log.debug("indexing with tabix: %s" % outFile)
	if tabixPath:
		cmdline = "%s/tabix %s -f" % (tabixPath, outFile)
		if params.get('seq_col')!=None:
			cmdline += ' -s %d' % params['seq_col']
		if params.get('start_col')!=None:
			cmdline += ' -b %d' % params['start_col']
		if params.get('end_col')!=None:
			cmdline += ' -e %d' % params['end_col']
		if params.get('preset')!=None:
			cmdline += ' -p %s' % params['preset']
		if params.get('meta_char')!=None:
			cmdline += ' -c %s' % params['meta_char']
		if params.get('line_skip')!=None:
			cmdline += ' -S %d' % params['line_skip']
		if params.get('zerobased')!=None:
			cmdline += ' -0'
		assert not os.system(cmdline)
	else:
		assert not params.get('line_skip'), "error: pysam does not seem to support this option, even though their documentation talks about it"
		pysam.tabix_index(outFile, force=True,
			seq_col=params.get('seq_col'),
			start_col=params.get('start_col'), end_col=params.get('end_col'),
			preset=params.get('preset'), meta_char=params.get('meta_char','#'),
			zerobased=params.get('zerobased',False))
	return outFile
Example #53
    def _make_vcf_and_read_depths_files(self):
        if not os.path.exists(self.ref_fa + '.fai'):
            pysam.faidx(self.ref_fa)

        tmp_vcf = self.vcf_file + '.tmp'
        with open(tmp_vcf, 'w') as f:
            print(pysam.mpileup(
                '-t',
                'INFO/AD,INFO/ADF,INFO/ADR',
                '-L',
                '99999999',
                '-A',
                '-f',
                self.ref_fa,
                '-u',
                '-v',
                self.bam,
            ),
                  end='',
                  file=f)

        got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix,
                                          self.min_var_read_depth,
                                          self.min_second_var_read_depth,
                                          self.max_allele_freq)
        if got != 0:
            raise Error('Error parsing vcf file. Cannot continue')

        pysam.tabix_compress(self.outprefix + '.read_depths',
                             self.read_depths_file)
        pysam.tabix_index(self.read_depths_file,
                          seq_col=0,
                          start_col=1,
                          end_col=1)
        os.unlink(self.outprefix + '.read_depths')
        os.unlink(tmp_vcf)
Example #54
def run_occ(args):
    """run occupancy calling

    """
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict=chrs, min_offset=args.flank + args.upper // 2 + max(pwm.up, pwm.down) + args.nuc_sep // 2)
    chunks.slop(chrs, up=args.nuc_sep // 2, down=args.nuc_sep // 2)
    chunks.merge()
    maxQueueSize = args.cores*10
    fragment_dist = FragmentMixDistribution(0, upper = args.upper)
    if args.sizes is not None:
        tmp = FragmentSizes.open(args.sizes)
        fragment_dist.fragmentsizes = FragmentSizes(0, args.upper, vals = tmp.get(0,args.upper))
    else:
        fragment_dist.getFragmentSizes(args.bam, chunks)
    fragment_dist.modelNFR()
    fragment_dist.plotFits(args.out + '.occ_fit.eps')
    fragment_dist.fragmentsizes.save(args.out + '.fragmentsizes.txt')
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
            flank = args.flank, bam = args.bam, ci = args.confidence_interval, step = args.step)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle1 = open(args.out + '.occ.bedgraph','w')
    out_handle1.close()
    out_handle2 = open(args.out + '.occ.lower_bound.bedgraph','w')
    out_handle2.close()
    out_handle3 = open(args.out + '.occ.upper_bound.bedgraph','w')
    out_handle3.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeOcc, args=(write_queue, args.out))
    write_process.start()
    peaks_handle = open(args.out + '.occpeaks.bed','w')
    peaks_handle.close()
    peaks_queue = mp.JoinableQueue()
    peaks_process = mp.Process(target = _writePeaks, args=(peaks_queue, args.out))
    peaks_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            nuc_dist += result[0]
            write_queue.put(result[1])
            peaks_queue.put(result[2])
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    peaks_queue.put('STOP')
    write_process.join()
    peaks_process.join()
    pysam.tabix_compress(args.out + '.occpeaks.bed', args.out + '.occpeaks.bed.gz', force=True)
    shell_command('rm ' + args.out + '.occpeaks.bed')
    pysam.tabix_index(args.out + '.occpeaks.bed.gz', preset="bed", force=True)
    for i in ('occ', 'occ.lower_bound', 'occ.upper_bound'):
        pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz', force=True)
        shell_command('rm ' + args.out + '.' + i + '.bedgraph')
        pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset="bed", force=True)

    dist_out = FragmentSizes(0, args.upper, vals = nuc_dist)
    dist_out.save(args.out + '.nuc_dist.txt')

    print("Making figure")
    # make figure
    fig = plt.figure()
    plt.plot(range(0, args.upper), dist_out.get(0, args.upper), label="Nucleosome Distribution")
    plt.xlabel("Fragment Size")
    plt.ylabel("Frequency")
    fig.savefig(args.out + '.nuc_dist.eps')
    plt.close(fig)
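run_occ hands all file writing to dedicated processes fed through JoinableQueues and shut down with a 'STOP' sentinel, so only one process ever appends to each output file. The pattern in minimal form (names hypothetical):

import multiprocessing as mp

def _writer(queue, path):
    # Append results from one dedicated process to avoid interleaved writes.
    with open(path, 'a') as out:
        for item in iter(queue.get, 'STOP'):  # drain until the sentinel
            out.write(item)
            queue.task_done()
    queue.task_done()  # account for the 'STOP' item itself

if __name__ == '__main__':
    queue = mp.JoinableQueue(maxsize=100)
    writer = mp.Process(target=_writer, args=(queue, 'track.bedgraph'))
    writer.start()
    for line in ('chr1\t0\t100\t0.5\n', 'chr1\t100\t200\t0.7\n'):
        queue.put(line)
    queue.put('STOP')
    writer.join()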
Example No. 55
0
    def run(self):
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)

        # Clean out, make and re-populate references directory
        # For now, assume a single, statically-named referenceSet
        print("Converting references...", file=sys.stderr)
        shutil.rmtree(self.refsetsDirectory, ignore_errors=True)
        os.makedirs(self.refsetsDirectory)
        shutil.copy(
            os.path.join(self.inputDirectory, "referenceset_hg37.json"),
            os.path.join(self.refsetsDirectory, "hg37.json"))

        os.makedirs(self.hg37Directory)
        for refFile in self.referenceFiles:
            refBase = os.path.splitext(refFile)[0]
            destFastaFilename = os.path.join(
                self.hg37Directory, refBase) + ".fa"
            shutil.copy(os.path.join(self.inputDirectory, refBase) + ".fa",
                        destFastaFilename)
            pysam.tabix_compress(destFastaFilename, destFastaFilename + ".gz")
            # Opening the bgzipped FASTA builds its .fai/.gzi indexes as a side effect.
            refFasta = pysam.FastaFile(destFastaFilename + ".gz")
            refFasta.close()
            os.remove(destFastaFilename)
            shutil.copy(
                os.path.join(self.inputDirectory, refBase) + ".json",
                os.path.join(self.hg37Directory, refBase) + ".json")

        # Clean out, make and repopulate dataset directories
        shutil.rmtree(self.datasetsDirectory, ignore_errors=True)
        os.makedirs(self.datasetsDirectory)

        for ds in self.datasets:
            dsdir = os.path.join(self.datasetsDirectory, ds)
            os.makedirs(dsdir)

            # Reads
            print("Converting reads...", file=sys.stderr)
            dsReadsdir = os.path.join(dsdir, "reads")
            os.makedirs(dsReadsdir)
            for readFile in self.datasetReads[ds]:
                destFile = os.path.join(
                    dsReadsdir,
                    readFile.split('_')[1].split('.')[0]) + ".bam"
                readSrc = pysam.AlignmentFile(
                    os.path.join(self.inputDirectory, readFile), "r")
                readDest = pysam.AlignmentFile(destFile, "wb",
                                               header=readSrc.header)
                destFilePath = readDest.filename

                for readData in readSrc:
                    readDest.write(readData)
                readDest.close()
                readSrc.close()
                pysam.index(destFilePath)

            # Variants
            print("Converting variants...", file=sys.stderr)
            dsVariantsdir = os.path.join(dsdir, "variants")
            os.makedirs(dsVariantsdir)
            for vgroup in self.datasetVariants[ds].keys():
                vgroupdir = os.path.join(dsVariantsdir, vgroup)
                os.makedirs(vgroupdir)
                for variantFile in self.datasetVariants[ds][vgroup]:
                    destFile = os.path.join(
                        vgroupdir, variantFile.split('_')[2])
                    shutil.copy(
                        os.path.join(
                            self.inputDirectory, variantFile), destFile)
                    # Pysam's tabix_index automatically compresses the file
                    # in place, creates a tabix index.
                    pysam.tabix_index(destFile, preset="vcf")

        print("done converting compliance data.", file=sys.stderr)
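As the inline comment notes, passing a plain-text VCF to tabix_index compresses it in place before indexing, so a separate tabix_compress step is unnecessary for VCFs. A minimal sketch with a hypothetical file:

import pysam

# After this call, 'calls.vcf' has been replaced on disk by
# 'calls.vcf.gz' plus its 'calls.vcf.gz.tbi' index.
pysam.tabix_index('calls.vcf', preset='vcf')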
Example No. 56
0
    def run(self):
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(
            self.inputDirectory, referenceFileName)
        outputRef = os.path.join(
            self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(
            self.outputDirectory,
            referenceFileName + '.gz')
        pysam.tabix_compress(
            outputRef, fastaFilePath)

        with open(
                os.path.join(
                    self.inputDirectory, "ref_brca1.json")) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        with open(
                os.path.join(
                    self.inputDirectory,
                    "referenceset_hg37.json")) as refMetadataFile:
            refSetMetadata = json.load(refMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(
                refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        self.repo.insertDataset(dataset)

        # The three samples go through identical setup, so build the
        # Individual and BioSample records in a loop.
        bioSamples = []
        for sampleName in ("HG00096", "HG00099", "HG00101"):
            individual = biodata.Individual(dataset, sampleName)
            with open(
                    os.path.join(
                        self.inputDirectory,
                        "individual_{}.json".format(sampleName))) as jsonString:
                individual.populateFromJson(jsonString.read())
            self.repo.insertIndividual(individual)
            bioSample = biodata.BioSample(dataset, sampleName)
            with open(
                    os.path.join(
                        self.inputDirectory,
                        "bioSample_{}.json".format(sampleName))) as jsonString:
                bioSample.populateFromJson(jsonString.read())
            bioSample.setIndividualId(individual.getId())
            self.repo.insertBioSample(bioSample)
            bioSamples.append(bioSample)

        readFiles = [
            "brca1_HG00096.sam",
            "brca1_HG00099.sam",
            "brca1_HG00101.sam"]

        for readFile in readFiles:
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(
                os.path.join(
                    self.outputDirectory,
                    name + ".bam"),
                "wb", header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(
            self.inputDirectory, ontologyMapFileName)
        outputOntologyMap = os.path.join(
            self.outputDirectory, ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf",
            "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"]
        for vcfFile in vcfFiles:
            self.addVariantSet(
                vcfFile,
                dataset,
                referenceSet,
                sequenceOntology,
                bioSamples)

        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        self.repo.commit()

        print("Done converting compliance data.", file=sys.stderr)
Example No. 57
0
def run_nuc(args):
    """run nucleosome calling

    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed,
                            chromDict=chrs,
                            min_offset=vmat.mat.shape[1] + vmat.upper // 2 +
                            max(pwm.up, pwm.down) + args.nuc_sep // 2,
                            min_length=args.nuc_sep * 2)
    chunks.slop(chrs, up=args.nuc_sep // 2, down=args.nuc_sep // 2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper=vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat=vmat,
                           fragmentsizes=fragment_dist,
                           bam=args.bam,
                           fasta=args.fasta,
                           pwm=args.pwm,
                           occ_track=args.occ_track,
                           sd=args.sd,
                           nonredundant_sep=args.nuc_sep,
                           redundant_sep=args.redundant_sep,
                           min_z=args.min_z,
                           min_lr=args.min_lr,
                           atac=args.atac)
    sets = chunks.split(items=args.cores * 5)
    pool1 = mp.Pool(processes=max(1, args.cores - 1))
    if args.write_all:
        outputs = [
            'nucpos', 'nucpos.redundant', 'nucleoatac_signal',
            'nucleoatac_signal.smooth', 'nucleoatac_background',
            'nucleoatac_raw'
        ]
    else:
        outputs = [
            'nucpos', 'nucpos.redundant', 'nucleoatac_signal',
            'nucleoatac_signal.smooth'
        ]
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize=maxQueueSize)
        write_processes[i] = mp.Process(target=_writeFuncs[i],
                                        args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, list(zip(j, itertools.repeat(params))))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph',
                                 args.out + '.' + i + '.bedgraph.gz',
                                 force=True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz',
                              preset="bed",
                              force=True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed',
                                 args.out + '.' + i + '.bed.gz',
                                 force=True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz',
                              preset="bed",
                              force=True)
Example No. 58
0
    def testCompression(self):
        '''see also issue 106'''
        pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
        checkBinaryEqual(self.tmpfilename, self.tmpfilename + ".gz")
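tabix_compress writes BGZF, which is a conforming gzip stream, so a rough standalone version of the round-trip this test relies on (file names hypothetical) is:

import gzip
import pysam

pysam.tabix_compress('example.bed', 'example.bed.gz', force=True)
# BGZF output is readable by any gzip decoder and must round-trip byte-for-byte.
with open('example.bed', 'rb') as raw, gzip.open('example.bed.gz', 'rb') as gz:
    assert raw.read() == gz.read()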