Example #1
def main(args, logger):
    """
    Main wrapper script for removing non-standard variants
    """
    # Allowed nucleotide characters
    good = {'A', 'T', 'C', 'G'}

    # Reader
    reader = pysam.VariantFile(args.input_vcf)

    # Writer
    mode = 'wz' if args.output_filename.endswith('gz') else 'w'
    writer = pysam.VariantFile(args.output_filename,
                               mode=mode,
                               header=reader.header)

    # Process
    try:
        for record in reader.fetch():
            alleles = list(record.alleles)
            non_standard = set(''.join(alleles).upper()) - good
            if non_standard:
                logger.warning('Removing %s:%s:%s', record.chrom, record.pos,
                               ','.join(alleles))
            else:
                writer.write(record)

    finally:
        reader.close()
        writer.close()

    if mode == 'wz':
        pysam.tabix_index(args.output_filename, preset='vcf', force=True)
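A minimal sketch of how this wrapper might be wired up; the argument names mirror the attributes the snippet reads (args.input_vcf, args.output_filename), while the argparse/logging scaffolding itself is an assumption, not part of the original project.

import argparse
import logging

if __name__ == '__main__':
    # main() above expects pysam to already be imported at module level
    parser = argparse.ArgumentParser(description='Remove non-standard variants')
    parser.add_argument('input_vcf')
    parser.add_argument('output_filename')
    logging.basicConfig(level=logging.INFO)
    main(parser.parse_args(), logging.getLogger('filter-vcf'))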
Example #2
    def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
        """
        TODO: docstring here
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        else:
            tmpVcf = outVcf

        args = ' '.join([
                'eff',
                    '-c', '{}/snpEff.config'.format(self.executable_path()),
                    '-i', in_format,
                    '-o', out_format,
                    genome,
                    '-treatAllAsProteinCoding false',
                    '-noLog',
                    '-ud 0',
                    '-noStats',
                    eff_options
                ])

        if inVcf.endswith('.gz'):
            pre_pipe = "zcat {} | ".format(inVcf)
        else:
            pre_pipe = "cat {} | ".format(inVcf)
        post_pipe = " > {}".format(tmpVcf)
        self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe,
                post_pipe=post_pipe)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #3
    def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        args = [
            '-treatAllAsProteinCoding', 'false',
            '-t',
            '-noLog',
            '-ud', '0',
            '-noStats',
            '-noShiftHgvs',
            genome,
            inVcf
            ]
        with open(tmpVcf, 'wt') as outf:
            self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #4
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz',
                          seq_col=2,
                          start_col=4,
                          end_col=5,
                          meta_char='#',
                          force=True)
        sys.stdout.write('OK\n')
    else:
        print('Compressing file...')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        print('Indexing file...')
        pysam.tabix_index(filename + '.gz',
                          seq_col=1,
                          start_col=2,
                          end_col=2,
                          meta_char='#',
                          force=True)
Example #5
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compress {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath)
                )
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath)
            )

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)"
        )
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
Example #6
    def testIndexPresetCompressed(self):
        '''test indexing via preset.'''

        pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
        pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
        checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
        checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
Example #7
 def batchTestHelper(self, modFile, pool, refLens):
     tmpName = tempfile.mkstemp('.tsv')[1]
     tmpfp = open(tmpName, 'wb')
     for line in modFile:
         tmpfp.write(line)
     tmpfp.close()
     pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2, 
                   meta_char='#', zerobased=True)
     tmpName += '.gz'
     modFile.close()
     
     self.chromoID = '1'
     self.modobj = mod.Mod(tmpName)
     self.modobj.load(self.chromoID)
     
     bamIter = [Read(tup[0], tup[1] + 1, tup[2]) for tup in pool]
                                
     a = annot.Annotator(self.chromoID, refLens[self.chromoID],
                             self.modobj, bamIter)
     results = a.execute()
     
     for i,res in enumerate(results):            
         self.assertEqual(polish(res[0]),pool[i][3])
         self.assertEqual(res[1], pool[i][4])
         self.assertEqual(res[2], pool[i][5])
         self.assertEqual(res[3], pool[i][6])
         self.assertEqual(res[4], pool[i][7])
     
     os.remove(tmpName)
     os.remove(tmpName+'.tbi')
Example #8
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """function to compute bias track

    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.Scores.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j,itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
Example #9
    def _make_assembly_vcf(self):
        tmp_vcf = self.final_assembly_vcf + '.tmp'
        cmd = ' '.join([
            self.samtools_exe, 'mpileup', '-t INFO/DPR,DV', '-A', '-f',
            self.final_assembly_fa, '-u', '-v', self.final_assembly_bam, '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)

        cmd = ' '.join([
            self.bcftools_exe, 'call -m', tmp_vcf, '|', self.bcftools_exe,
            'query', r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[%DPR]\n' ''', '>',
            self.final_assembly_read_depths + '.tmp'
        ])

        common.syscall(cmd, verbose=self.verbose)
        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp',
                             self.final_assembly_read_depths)
        pysam.tabix_index(self.final_assembly_read_depths,
                          seq_col=0,
                          start_col=1,
                          end_col=1)
        os.unlink(self.final_assembly_read_depths + '.tmp')

        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v', tmp_vcf, '|', self.bcftools_exe,
            'filter', '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
            ' & MIN(DV)>=' + str(self.bcf_min_dv),
            ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp), ' & QUAL >=',
            str(self.bcf_min_qual), '"', '-o', self.final_assembly_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)
        os.unlink(tmp_vcf)
Example #10
    def addVariantSet(
            self, variantFileName, dataset, referenceSet,
            ontology, biosamples):
        inputVcf = os.path.join(
            self.inputDirectory, variantFileName)
        outputVcf = os.path.join(
            self.outputDirectory, variantFileName)
        shutil.copy(inputVcf, outputVcf)
        pysam.tabix_index(outputVcf, preset="vcf")
        variantSet = variants.HtslibVariantSet(
            dataset, variantFileName.split('_')[1])
        variantSet.setReferenceSet(referenceSet)
        variantSet.populateFromFile(
            [os.path.abspath(outputVcf + ".gz")],
            [os.path.abspath(outputVcf + ".gz.tbi")])
        variantSet.checkConsistency()
        for callSet in variantSet.getCallSets():
            for biosample in biosamples:
                if biosample.getLocalId() == callSet.getLocalId():
                    callSet.setBiosampleId(biosample.getId())
        self.repo.insertVariantSet(variantSet)

        for annotationSet in variantSet.getVariantAnnotationSets():
            annotationSet.setOntology(ontology)
            self.repo.insertVariantAnnotationSet(annotationSet)
Example #11
def classDisbyMotif(paras):
    path_dis = paras["output_dis"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    print("[INFO] Scanning the distribution file of microsatellite!")
    vcffile = pysam.VariantFile(path_dis)
    File_motif = {}

    recordNum = 0
    for rec in vcffile.fetch():
        recordNum += 1
        recordInfo = rec.info
        motif = recordInfo["motif"]
        support_reads = int(recordInfo["support_reads"].split("|")[0])
        if support_reads > min_support_reads:
            if motif not in File_motif:
                File_motif[motif] = pysam.VariantFile(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz", 'w',
                                                      header=vcffile.header)
            File_motif[motif].write(rec)

    motifList = []
    for motif in File_motif:
        File_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz", force=True, preset="vcf")

        motifList.append(motif)
    set_value("motifList", motifList)
Example #12
    def make_index(file_name):
        """Make index file for input file"""
        f_bs, f_ext = os.path.splitext(file_name)

        def indexed(fn, ext):
            return os.path.exists(fn + ext)

        def uptodate(fn, ext):
            return os.getmtime(fn) < os.getmtime(fn + ext)

        infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
        if f_ext == ".fa":
            if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
                print(infomsg)
            else:
                pysam.faidx(file_name)
        elif f_ext in [".bam", ".cram"]:
            if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
                print(infomsg)
            else:
                pysam.index(file_name)
        elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
            if indexed(file_name, ".gz.tbi") and uptodate(
                    file_name, ".gz.tbi"):
                print(infomsg)
            else:
                pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
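Note that in this final branch pysam.tabix_index compresses the plain-text file in place: after the call, file_name is replaced by file_name + '.gz' plus its '.gz.tbi' index (Example #27 below asserts exactly this behavior). A sketch of what a caller sees, with 'calls.bed' as a placeholder name:

make_index('calls.bed')
# afterwards: calls.bed.gz and calls.bed.gz.tbi exist; calls.bed itself is gone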
Example #13
def get_cov(args, bases = 50000, splitsize = 1000):
    """function to get coverages

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.cov.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j,itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
Example #14
def class_dis_by_motif(paras):
    path_pre = paras["output_pre"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    logger.info("Scanning the distribution file of microsatellite!")
    vcf_file = pysam.VariantFile(path_pre)
    files_motif = {}
    record_num = 0
    for rec in vcf_file.fetch():
        record_num += 1
        record_info = rec.info
        motif = record_info["motif"]
        support_reads = record_info["depth"]
        if support_reads > min_support_reads:
            if motif not in files_motif:
                files_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w',
                    header=vcf_file.header)
            files_motif[motif].write(rec)

    motif_list = []
    for motif in files_motif:
        files_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif +
                          ".vcf.gz",
                          force=True,
                          preset="vcf")

        motif_list.append(motif)
    set_value("motif_list", motif_list)
Example #15
def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c',
                      '--chr-col',
                      type='int',
                      default=0,
                      dest='chrom_col')
    parser.add_option('-s',
                      '--start-col',
                      type='int',
                      default=1,
                      dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname,
                      seq_col=options.chrom_col,
                      start_col=options.start_col,
                      end_col=options.end_col)
Example #16
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, index_fname, out_fname = args

    # Create index.
    if options.preset:
        # Preset type.
        pysam.tabix_index(filename=index_fname,
                          preset=options.preset,
                          keep_original=True,
                          index_filename=out_fname)
    else:
        # For interval files; column indices are 0-based.
        pysam.tabix_index(filename=index_fname,
                          seq_col=(options.chrom_col - 1),
                          start_col=(options.start_col - 1),
                          end_col=(options.end_col - 1),
                          keep_original=True,
                          index_filename=out_fname)
    if os.path.getsize(index_fname) == 0:
        sys.stderr.write(
            "The converted tabix index file is empty, meaning the input data is invalid."
        )
Example #17
def compressVcf(vcfname,forceflag=False,remove=False):
    """Runs bgzip and tabix on input VCF file.

    Using the pysam library, this function runs the bgzip and tabix utilities
    on the given input file. By default, this will not overwrite an existing
    zipped file, but will overwrite an existing index. `remove` can be set to
    delete the unzipped file.

    Parameters
    ----------
    vcfname : str
        Name of uncompressed VCF file
    forceflag : bool (False)
        If true, will overwrite (vcfname).gz if it exists
    remove : bool (False)
        If true, will delete uncompressed source file

    Returns
    -------
    cvcfname : str
        Filepath to compressed VCF file
    """
    cvcfname = vcfname+".gz"
    pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
    pysam.tabix_index(cvcfname,preset="vcf",force=True)
    if remove:
        os.remove(vcfname)
    return cvcfname
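A hypothetical invocation, assuming 'sample.vcf' exists and is coordinate-sorted:

gz = compressVcf('sample.vcf', forceflag=True, remove=True)
# gz == 'sample.vcf.gz', with 'sample.vcf.gz.tbi' written alongside it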
Example #18
def to_tabix(bgzip_fname,
             out_fname,
             preset=None,
             chrom_col=None,
             start_col=None,
             end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        preset=preset,
                                        keep_original=True,
                                        index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True,
                                        index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.exit(
            "The converted tabix index file is empty, meaning the input data is invalid."
        )
    return bgzip_fname
Example #19
def run(argv):

    if '-h' in argv or '--help' in argv:
        print('Make a single large tabixed file of all phenotypes data')
        exit(1)

    if should_run():
        # we don't need `ffi.new('char[]', ...)` because args are `const`
        ret = lib.cffi_make_matrix(
            sites_filepath.encode('utf8'),
            common_filepaths['pheno']('*').encode('utf8'),
            matrix_gz_tmp_filepath.encode('utf8'))
        ret_bytes = ffi.string(ret, maxlen=1000)
        if ret_bytes != b'ok':
            raise PheWebError(
                'The portion of `pheweb matrix` written in c++/cffi failed with the message '
                + repr(ret_bytes))
        os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath)
        pysam.tabix_index(
            filename=matrix_gz_filepath,
            force=True,
            seq_col=0,
            start_col=1,
            end_col=1  # note: these are 0-based, but `/usr/bin/tabix` is 1-based
        )
    else:
        print('matrix is up-to-date!')
Example #20
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph, bedgraph + '.gz')
    pysam.tabix_index(bedgraph + '.gz',
                      seq_col=0,
                      start_col=1,
                      end_col=2,
                      zerobased=True)
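Once a bedgraph has been compressed and indexed like this, regions can be queried directly. A minimal sketch, assuming the input was named 'signal.bedgraph' and contains a contig called 'chr1':

import pysam

with pysam.TabixFile('signal.bedgraph.gz') as tbx:
    for row in tbx.fetch('chr1', 10000, 20000, parser=pysam.asTuple()):
        chrom, start, end = row[0], int(row[1]), int(row[2])
        value = float(row[3])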
Example #21
    def gff32tabix(self, file_sorted_gff3, file_sorted_gz_gff3, file_gff3_tbi):  # pylint: disable=no-self-use,unused-argument
        """
        GFF3 to Tabix

        Compresses the sorted GFF3 file and then uses Tabix to generate an index
        of the GFF3 file.

        Parameters
        ----------
        file_sorted_gff3 : str
            Location of a sorted GFF3 file
        file_sorted_gz_gff3 : str
            Location of the bgzip compressed GFF3 file
        file_gff3_tbi : str
            Location of the Tabix index file

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.gff32tabix(file_sorted_gff3, gz_file, tbi_file):
               output_metadata.set_exception(
                   Exception(
                       "gff32tabix: Could not process files {}, {}.".format(*input_files)))
        """
        pysam.tabix_compress(file_sorted_gff3, file_sorted_gz_gff3)  # pylint: disable=no-member
        pysam.tabix_index(file_sorted_gz_gff3, preset='gff')  # pylint: disable=no-member
        return True
Example #22
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join(sorted(genomes))
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception()

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
               outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #23
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join([util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception()

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
               outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #24
    def testEmptyFileVCFGZWithIndex(self):
        with get_temp_context("tmp_testEmptyFile.vcf") as fn:
            with open(fn, "w"):
                pass
            # tabix_index will automatically compress
            pysam.tabix_index(fn, preset="vcf", force=True)

            self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
Example #25
    def setUp( self ):
        
        self.tmpfilename = "tmp_%s.vcf" % id(self)
        shutil.copyfile( self.filename, self.tmpfilename )
        pysam.tabix_index( self.tmpfilename, preset = "vcf" )

        self.tabix = pysam.Tabixfile( self.tmpfilename + ".gz" )
        self.compare = [ x[:-1].split("\t") for x in open( self.filename, "r") if not x.startswith("#") ]
Example #26
    def test_indexing_to_custom_location_works(self):
        '''test indexing a file with a non-default location.'''

        index_path = get_temp_filename(suffix='custom.tbi')
        pysam.tabix_index(self.tmpfilename, preset="gff",
                          index=index_path, force=True)
        self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
        os.unlink(index_path)
Example #27
    def testIndexPresetUncompressed(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename, preset=self.preset)
        # check if uncompressed file has been removed
        self.assertEqual(os.path.exists(self.tmpfilename), False)
        checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
        checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
Example #28
    def indexingVariantFile(self, varFile):
        """Index variant file with Tabix"""

        logging.info('Trying to index file: {}'.format(varFile))
        pysam.tabix_index(varFile, force=True, preset="vcf")
        if not self.isVariantFileIndexed(varFile):
            raise Exception("Can not index file: {}".format(varFile))
        return True
Example #29
def get_refseq_allele(vcf_file, fasta_file, out_file, verbose=False):
    """
    Output a reference VCF file that contains the ref/alt allele coding
    with the ref allele matching the reference sequence. This assumes that
    all alleles are on the forward strand. If neither the ref nor the alt
    allele is found in the reference sequence, the alleles are output
    as missing.

    Parameters
    ----------
    vcf_file : str
        VCF/BCF file name.
    fasta_file : str
        Fasta file name. Must be unzipped and have an fai index.
    out_file : str
        Output VCF file name.
    """

    rdr = fasta_fai.Reader(fasta_file)
    vcf_in = pysam.VariantFile(vcf_file, mode='r')

    vcf_out = pysam.VariantFile(out_file, mode='w')
    for r in vcf_in.header.records:
        vcf_out.header.add_record(r)

    counter = 0
    for rec in vcf_in.fetch():
        counter = counter + 1
        if verbose and counter % 1000 == 0:
            print(counter, "records")
        rec_out = vcf_out.new_record()
        rec_out.id = rec.id
        rec_out.pos = rec.pos
        rec_out.chrom = rec.chrom
        o = list(range(len(rec.alleles)))
        orv = list(reversed(o))
        ref_i = 0
        for i in orv:
            a = rec.alleles[i]
            refseq_base = rdr.get_seq(rec.chrom, rec.pos - 1, len(a))
            if a == refseq_base:
                ref_i = i
        o[ref_i] = 0
        o[0] = ref_i
        alleles = list()
        for i in o:
            alleles.append(rec.alleles[i])
        if (len(alleles) == 1):
            alleles.append('.')
        rec_out.alleles = tuple(alleles)
        vcf_out.write(rec_out)
    rdr.close()
    vcf_in.close()
    vcf_out.close()
    if out_file[-3:] == ".gz":
        pysam.tabix_index(out_file, preset="vcf", force=True)
Example #30
 def test_indexing_with_lineskipping_works(self):
     '''test indexing via preset and lineskip.'''
     pysam.tabix_index(self.tmpfilename,
                       seq_col=0,
                       start_col=3,
                       end_col=4,
                       line_skip=1,
                       zerobased=False)
     self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
Example #31
 def test_vcf_with_tbi_index(self):
     with get_temp_context("tmp_fn.vcf") as fn:
         shutil.copyfile(self.vcf_filename, fn)
         pysam.tabix_index(fn, preset="vcf", force=True)
         self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
         self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))
         
         with pysam.VariantFile(fn + ".gz") as inf:
             self.assertEqual(len(list(inf.fetch("20"))), 3)
Example #32
def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')
Example #33
def run_nfr(args):
    """run nfr calling

    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source = "BAM file") 
    chunks.merge()
    maxQueueSize = args.cores * 10 
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper,
                            fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example #34
def make_non_somatic_panel(file_lst, panelname, genome, cosmic_db, cnt):

    indel_lst = filter_indels(file_lst, genome, cosmic_db, cnt)
    vcf_data = to_vcf_data(indel_lst)

    with open(panelname, "w") as f:
        f.write(vcf_header() + "\n")
        f.write("\n".join(vcf_data))

    pysam.tabix_index(panelname, preset="vcf")
Example #35
    def test_indexing_with_explict_columns_works(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename,
                          seq_col=0,
                          start_col=3,
                          end_col=4,
                          line_skip=0,
                          zerobased=False)
        self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
Example #36
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f), os.path.join(options.output_dir, f + '.gz'), force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'), seq_col=4, start_col=6, end_col=7, meta_char='#',
                      force=True)
    sys.stdout.write('OK\n')
Example #37
    def test_indexing_to_custom_location_works(self):
        '''test indexing a file with a non-default location.'''

        index_path = get_temp_filename(suffix='custom.tbi')
        pysam.tabix_index(self.tmpfilename,
                          preset="gff",
                          index=index_path,
                          force=True)
        self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
        os.unlink(index_path)
Example #38
    def _index_with_tabix(self):
        """Compress and index output file by Tabix"""

        pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
        pysam.tabix_index(self._fn + '.gz',
                          seq_col=self.idx_chrom,
                          start_col=self.idx_start,
                          end_col=self.idx_end,
                          meta_char='#',
                          force=True)
Example #39
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warning(
                        "Removing loci {0}:{1} where ref and alt alleles are same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf +
                                                      '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
Example #40
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1, # skip header
    )
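For reference, the roughly equivalent shell command (tabix numbers columns from 1 where pysam.tabix_index numbers them from 0; 'matrix.gz' stands in for the bgzipped file):

tabix -f -s 1 -b 2 -e 2 -S 1 matrix.gz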
Example #41
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed','w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed', args.out + '.nucmap_combined.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset = "bed", force = True)
Example #42
    def testEmptyFileVCFGZWithIndex(self):
        with open("tmp_testEmptyFile.vcf", "w"):
            pass

        pysam.tabix_index("tmp_testEmptyFile.vcf",
                          preset="vcf",
                          force=True)

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tmp_testEmptyFile.vcf.gz")

        os.unlink("tmp_testEmptyFile.vcf.gz")
        os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
Example #43
    def index(destDir, inputFilename, fileColumnNumList=None, preset=None):
        """
        Create a tabix index file for genomic position datasource tsv files.
        Prerequisites (for genomic position indexed):
            Input file has three columns that can be mapped to chromosome, start position, and end position without any modification.
                For example, ['hg19.oreganno.chrom', 'hg19.oreganno.chromStart', 'hg19.oreganno.chromEnd'] in oreganno.hg19.txt

        This will overwrite an existing index (since the force parameter is set to True in pysam.tabix_index() call).
        Also, in cases where the inputFilename doesn't end with a ".gz", a compressed file will be created and indexed.

        :param destDir: destination directory
        :param fileColumnNumList: ordered list.  This list contains the corresponding entries (column numbers)
            in the tsv file. Typically, this would be [chr,start,end]  or [gene, startAA, endAA]
        :param inputFilename: tsv file input
        :param preset: if preset is provided, the column coordinates are taken from a preset. Valid values for preset
        are "gff", "bed", "sam", "vcf", "psltbl", and "pileup".
        """
        fileColumnNumList = [] if fileColumnNumList is None else fileColumnNumList
        inputFilename = os.path.abspath(inputFilename)
        fileDir = os.path.dirname(inputFilename)
        fileName, fileExtension = os.path.splitext(os.path.basename(inputFilename))

        if fileExtension in (".gz",):
            # Ensure .gz.tbi file is there as well
            inputIndexFilename = os.path.join(fileDir, ".".join([inputFilename, "tbi"]))
            if not os.path.exists(inputIndexFilename):
                msg = "Missing tabix index file %s." % inputIndexFilename
                raise TabixIndexerFileMissingError(msg)

            outputFilename = os.path.join(destDir, string.join([fileName, "gz"], "."))
            shutil.copyfile(inputFilename, outputFilename)

            outputIndexFilename = os.path.join(destDir, ".".join([fileName, "gz", "tbi"]))
            shutil.copyfile(inputIndexFilename, outputIndexFilename)

            return outputFilename

        outputFilename = os.path.join(destDir, string.join([fileName, ".tabix_indexed", fileExtension], ""))
        # Copy the input file to output file.
        shutil.copyfile(inputFilename, outputFilename)

        # Load the file into a tsvReader.
        if preset in ("gff", "bed", "sam", "vcf", "psltbl", "pileup"):
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, preset=preset)
        else:
            # Have to specify min_size=0 in pysam 0.8.1 to get pysam to correctly output a .tbi file 
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, seq_col=fileColumnNumList[0],
                                            start_col=fileColumnNumList[1], end_col=fileColumnNumList[2])

        return tabix_index
Example #44
def run(args):
    fin=IO.fopen(args.input,"r")
    outfile=args.input
    if not args.sorted:
        l = [ i for i in TableIO.parse(fin,args.format) ]
        l.sort()
        name=splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile,"w")
        for i in l:
            print(i,file=out)
        out.close()
    format = args.format.translate(str.maketrans('', '', digits))
    tabix_index(outfile,preset=format)
Example #45
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None, start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset, keep_original=True,
                                        index=out_fname, force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1), end_col=(end_col - 1),
                                        keep_original=True, index=out_fname, force=True)
    if os.path.getsize(out_fname) == 0:
        sys.stderr.write("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
Example #46
def merge_vcfs(in_vcfs_dir, contigs, out_vcf):
    logger.info("Mergings per-chromosome VCFs from %s" % in_vcfs_dir)
    header_done = False
    out_vcf_file = open(out_vcf, "w")
    for contig in contigs:
        chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name)
        if os.path.isfile(chr_vcf):
            chr_tabix_file = pysam.Tabixfile(chr_vcf)
            if not header_done:
                print_header(chr_tabix_file.header, out_vcf_file)
            for entry in chr_tabix_file.fetch():
                out_vcf_file.write("%s\n" % entry)
            chr_tabix_file.close()
    out_vcf_file.close()
    pysam.tabix_index(out_vcf, force=True, preset="vcf")
Example #47
File: gtf.py Project: soh-i/Ivy
 def _prepare(self):
     if not os.path.isfile(self.ingtf + ".gz.tbi"):
         print "Generate indexed GTF (tabix) file: '{0}'...".format(self.ingtf)
         compressed_gtf = pysam.tabix_index(self.ingtf, preset="gff")
     else:
         compressed_gtf = self.ingtf + ".gz"
     self.tabixfile = pysam.Tabixfile(compressed_gtf)
Example #48
 def addVariantSet(self, variantFileName, dataset, referenceSet, ontology):
     inputVcf = os.path.join(
         self.inputDirectory, variantFileName)
     outputVcf = os.path.join(
         self.outputDirectory, variantFileName)
     shutil.copy(inputVcf, outputVcf)
     pysam.tabix_index(outputVcf, preset="vcf")
     variantSet = variants.HtslibVariantSet(
         dataset, variantFileName.split('_')[1])
     variantSet.setReferenceSet(referenceSet)
     variantSet.populateFromFile(
         [outputVcf + ".gz"], [outputVcf + ".gz.tbi"])
     variantSet.checkConsistency()
     self.repo.insertVariantSet(variantSet)
     for annotationSet in variantSet.getVariantAnnotationSets():
         annotationSet.setOntology(ontology)
         self.repo.insertVariantAnnotationSet(annotationSet)
Example #49
def bgzip_index(original_file, new_file, file_format):
    """

    :param original_file:
    :param new_file:
    :param file_format:
    :return:
    """

    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
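Hypothetical calls, with the file names as placeholders:

bgzip_index('variants.vcf', 'variants.vcf.gz', 'vcf')  # tabix_index compresses in place
bgzip_index('genome.fa', 'genome.fa.gz', 'fa')         # bgzip-compress, then faidx the result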
Example #50
 def __tabix(self, file_name):
     """ tabix into gz and tbi file """
     return pysam.tabix_index(file_name,
                              force     = True,
                              seq_col   = combivep_settings.LJB_PARSED_0_INDEX_CHROM,
                              start_col = combivep_settings.LJB_PARSED_0_INDEX_POS,
                              end_col   = combivep_settings.LJB_PARSED_0_INDEX_POS,
                              zerobased = False)
Example #51
def getTabixMod(filename):
    '''Unzip a mod file, use bgzip to rezip it, and build a tabix index.'''
    logger = logging.getLogger('tmpmod') 
    logger.info('extracting MOD file ...')   
    modfp = gzip.open(filename, 'rb')
    tmpName = tempfile.mkstemp('.tsv')[1]    
    tmpfp = open(tmpName, 'wb')
    tmpfp.writelines(modfp)
    tmpfp.close()
    modfp.close()    
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2, 
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    logger.info('temporary file %s created', tmpName)
    return tmpName

#print(getTabixMod("../data/B.mod"))