Example #1
def main(args, logger):
    """
    Main wrapper script for removing non-standard variants
    """
    # Allowed nucleotide characters
    good = {'A', 'T', 'C', 'G'}

    # Reader
    reader = pysam.VariantFile(args.input_vcf)

    # Writer
    mode = 'wz' if args.output_filename.endswith('gz') else 'w'
    writer = pysam.VariantFile(args.output_filename,
                               mode=mode,
                               header=reader.header)

    # Process
    try:
        for record in reader.fetch():
            alleles = list(record.alleles)
            non_standard = set(''.join(alleles).upper()) - good
            if non_standard:
                logger.warning('Removing %s:%s:%s', record.chrom, record.pos,
                               ','.join(alleles))
            else:
                writer.write(record)

    finally:
        reader.close()
        writer.close()

    if mode == 'wz':
        pysam.tabix_index(args.output_filename, preset='vcf', force=True)
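A minimal sketch of how this wrapper might be wired up; the argument names mirror the attributes the snippet reads (args.input_vcf, args.output_filename), while the argparse/logging scaffolding itself is an assumption, not part of the original project.

import argparse
import logging

if __name__ == '__main__':
    # main() above expects pysam to already be imported at module level
    parser = argparse.ArgumentParser(description='Remove non-standard variants')
    parser.add_argument('input_vcf')
    parser.add_argument('output_filename')
    logging.basicConfig(level=logging.INFO)
    main(parser.parse_args(), logging.getLogger('filter-vcf'))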
Example #2
    def eff_vcf(self, inVcf, outVcf, genome, java_flags='-Xmx2g',
            in_format='vcf', out_format='vcf', eff_options=''):
        """
        TODO: docstring here
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        else:
            tmpVcf = outVcf

        args = ' '.join([
                'eff',
                    '-c', '{}/snpEff.config'.format(self.executable_path()),
                    '-i', in_format,
                    '-o', out_format,
                    genome,
                    '-treatAllAsProteinCoding false',
                    '-noLog',
                    '-ud 0',
                    '-noStats',
                    eff_options
                ])

        if inVcf.endswith('.gz'):
            pre_pipe = "zcat {} | ".format(inVcf)
        else:
            pre_pipe = "cat {} | ".format(inVcf)
        post_pipe = " > {}".format(tmpVcf)
        self.execute(args, java_flags=java_flags, pre_pipe=pre_pipe,
                post_pipe=post_pipe)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #3
    def annotate_vcf(self, inVcf, genome, outVcf, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        args = [
            '-treatAllAsProteinCoding', 'false',
            '-t',
            '-noLog',
            '-ud', '0',
            '-noStats',
            '-noShiftHgvs',
            genome,
            inVcf
            ]
        with open(tmpVcf, 'wt') as outf:
            self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf)
        
        if outVcf.endswith('.vcf.gz'):
            pysam.tabix_compress(tmpVcf, outVcf, force=True)
            pysam.tabix_index(outVcf, force=True, preset='vcf')
            os.unlink(tmpVcf)
Example #4
def indexFile(options):
    filename = options.output
    if options.ensembl is not None:
        sys.stdout.write('Compressing output file... ')
        sys.stdout.flush()
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        sys.stdout.write('OK\n')
        sys.stdout.write('Indexing output file... ')
        sys.stdout.flush()
        pysam.tabix_index(filename + '.gz',
                          seq_col=2,
                          start_col=4,
                          end_col=5,
                          meta_char='#',
                          force=True)
        sys.stdout.write('OK\n')
    else:
        print('Compressing file...')
        pysam.tabix_compress(filename, filename + '.gz', force=True)
        print('Indexing file...')
        pysam.tabix_index(filename + '.gz',
                          seq_col=1,
                          start_col=2,
                          end_col=2,
                          meta_char='#',
                          force=True)
Example #5
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compress {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath)
                )
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath)
            )

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)"
        )
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
Example #6
    def testIndexPresetCompressed(self):
        '''test indexing via preset.'''

        pysam.tabix_compress(self.tmpfilename, self.tmpfilename + ".gz")
        pysam.tabix_index(self.tmpfilename + ".gz", preset=self.preset)
        checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
        checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
Example #7
 def batchTestHelper(self, modFile, pool, refLens):
     tmpName = tempfile.mkstemp('.tsv')[1]
     tmpfp = open(tmpName, 'wb')
     for line in modFile:
         tmpfp.write(line)
     tmpfp.close()
     pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2, 
                   meta_char='#', zerobased=True)
     tmpName += '.gz'
     modFile.close()
     
     self.chromoID = '1'
     self.modobj = mod.Mod(tmpName)
     self.modobj.load(self.chromoID)
     
     bamIter = [Read(tup[0], tup[1] + 1, tup[2]) for tup in pool]
                                
     a = annot.Annotator(self.chromoID, refLens[self.chromoID],
                             self.modobj, bamIter)
     results = a.execute()
     
     for i,res in enumerate(results):            
         self.assertEqual(polish(res[0]),pool[i][3])
         self.assertEqual(res[1], pool[i][4])
         self.assertEqual(res[2], pool[i][5])
         self.assertEqual(res[3], pool[i][6])
         self.assertEqual(res[4], pool[i][7])
     
     os.remove(tmpName)
     os.remove(tmpName+'.tbi')
Example #8
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """function to compute bias track

    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.Scores.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j,itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
Example #9
    def _make_assembly_vcf(self):
        tmp_vcf = self.final_assembly_vcf + '.tmp'
        cmd = ' '.join([
            self.samtools_exe, 'mpileup', '-t INFO/DPR,DV', '-A', '-f',
            self.final_assembly_fa, '-u', '-v', self.final_assembly_bam, '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)

        cmd = ' '.join([
            self.bcftools_exe, 'call -m', tmp_vcf, '|', self.bcftools_exe,
            'query', r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[%DPR]\n' ''', '>',
            self.final_assembly_read_depths + '.tmp'
        ])

        common.syscall(cmd, verbose=self.verbose)
        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp',
                             self.final_assembly_read_depths)
        pysam.tabix_index(self.final_assembly_read_depths,
                          seq_col=0,
                          start_col=1,
                          end_col=1)
        os.unlink(self.final_assembly_read_depths + '.tmp')

        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v', tmp_vcf, '|', self.bcftools_exe,
            'filter', '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
            ' & MIN(DV)>=' + str(self.bcf_min_dv),
            ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp), ' & QUAL >=',
            str(self.bcf_min_qual), '"', '-o', self.final_assembly_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)
        os.unlink(tmp_vcf)
Example #10
    def addVariantSet(
            self, variantFileName, dataset, referenceSet,
            ontology, biosamples):
        inputVcf = os.path.join(
            self.inputDirectory, variantFileName)
        outputVcf = os.path.join(
            self.outputDirectory, variantFileName)
        shutil.copy(inputVcf, outputVcf)
        pysam.tabix_index(outputVcf, preset="vcf")
        variantSet = variants.HtslibVariantSet(
            dataset, variantFileName.split('_')[1])
        variantSet.setReferenceSet(referenceSet)
        variantSet.populateFromFile(
            [os.path.abspath(outputVcf + ".gz")],
            [os.path.abspath(outputVcf + ".gz.tbi")])
        variantSet.checkConsistency()
        for callSet in variantSet.getCallSets():
            for biosample in biosamples:
                if biosample.getLocalId() == callSet.getLocalId():
                    callSet.setBiosampleId(biosample.getId())
        self.repo.insertVariantSet(variantSet)

        for annotationSet in variantSet.getVariantAnnotationSets():
            annotationSet.setOntology(ontology)
            self.repo.insertVariantAnnotationSet(annotationSet)
Example #11
def classDisbyMotif(paras):
    path_dis = paras["output_dis"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    print("[INFO] Scanning the distribution file of microsatellite!")
    vcffile = pysam.VariantFile(path_dis)
    File_motif = {}

    recordNum = 0
    for rec in vcffile.fetch():
        recordNum += 1
        recordInfo = rec.info
        motif = recordInfo["motif"]
        support_reads = int(recordInfo["support_reads"].split("|")[0])
        if support_reads > min_support_reads:
            if motif not in File_motif:
                File_motif[motif] = pysam.VariantFile(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz", 'w',
                                                      header=vcffile.header)
            File_motif[motif].write(rec)

    motifList = []
    for motif in File_motif:
        File_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz", force=True, preset="vcf")

        motifList.append(motif)
    set_value("motifList", motifList)
Example #12
    def make_index(file_name):
        """Make index file for input file"""
        f_bs, f_ext = os.path.splitext(file_name)

        def indexed(fn, ext):
            return os.path.exists(fn + ext)

        def uptodate(fn, ext):
            return os.getmtime(fn) < os.getmtime(fn + ext)

        infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
        if f_ext == ".fa":
            if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
                print(infomsg)
            else:
                pysam.faidx(file_name)
        elif f_ext in [".bam", ".cram"]:
            if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
                print(infomsg)
            else:
                pysam.index(file_name)
        elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
            if indexed(file_name, ".gz.tbi") and uptodate(
                    file_name, ".gz.tbi"):
                print(infomsg)
            else:
                pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
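Note that in this final branch pysam.tabix_index compresses the plain-text file in place: after the call, file_name is replaced by file_name + '.gz' plus its '.gz.tbi' index (Example #27 below asserts exactly this behavior). A sketch of what a caller sees, with 'calls.bed' as a placeholder name:

make_index('calls.bed')
# afterwards: calls.bed.gz and calls.bed.gz.tbi exist; calls.bed itself is gone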
Example #13
def get_cov(args, bases = 50000, splitsize = 1000):
    """function to get coverages

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.cov.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j,itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
Example #14
def class_dis_by_motif(paras):
    path_pre = paras["output_pre"]
    path_dis_parameter = paras["output_tmp"]
    min_support_reads = paras["minimum_support_reads"]
    logger.info("Scanning the distribution file of microsatellite!")
    vcf_file = pysam.VariantFile(path_pre)
    files_motif = {}
    record_num = 0
    for rec in vcf_file.fetch():
        record_num += 1
        record_info = rec.info
        motif = record_info["motif"]
        support_reads = record_info["depth"]
        if support_reads > min_support_reads:
            if motif not in files_motif:
                files_motif[motif] = pysam.VariantFile(
                    path_dis_parameter + "/tmp_motif_" + motif + ".vcf.gz",
                    'w',
                    header=vcf_file.header)
            files_motif[motif].write(rec)

    motif_list = []
    for motif in files_motif:
        files_motif[motif].close()
        pysam.tabix_index(path_dis_parameter + "/tmp_motif_" + motif +
                          ".vcf.gz",
                          force=True,
                          preset="vcf")

        motif_list.append(motif)
    set_value("motif_list", motif_list)
Example #15
def main():
    # Read options, args.
    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-c',
                      '--chr-col',
                      type='int',
                      default=0,
                      dest='chrom_col')
    parser.add_option('-s',
                      '--start-col',
                      type='int',
                      default=1,
                      dest='start_col')
    parser.add_option('-e', '--end-col', type='int', default=1, dest='end_col')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        exit(1)
    input_fname, output_fname = args
    output_dir = os.path.dirname(output_fname)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pysam.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based.
    pysam.tabix_index(output_fname,
                      seq_col=options.chrom_col,
                      start_col=options.start_col,
                      end_col=options.end_col)
Example #16
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    parser.add_option('-c', '--chr-col', type='int', dest='chrom_col')
    parser.add_option('-s', '--start-col', type='int', dest='start_col')
    parser.add_option('-e', '--end-col', type='int', dest='end_col')
    parser.add_option('-P', '--preset', dest='preset')
    (options, args) = parser.parse_args()
    input_fname, index_fname, out_fname = args

    # Create index.
    if options.preset:
        # Preset type.
        pysam.tabix_index(filename=index_fname,
                          preset=options.preset,
                          keep_original=True,
                          index_filename=out_fname)
    else:
        # For interval files; column indices are 0-based.
        pysam.tabix_index(filename=index_fname,
                          seq_col=(options.chrom_col - 1),
                          start_col=(options.start_col - 1),
                          end_col=(options.end_col - 1),
                          keep_original=True,
                          index_filename=out_fname)
    if os.path.getsize(index_fname) == 0:
        sys.stderr.write(
            "The converted tabix index file is empty, meaning the input data is invalid."
        )
Example #17
def compressVcf(vcfname,forceflag=False,remove=False):
    """Runs bgzip and tabix on input VCF file.

    Using the pysam library, this function runs the bgzip and tabix utilities
    on the given input file. By default, this will not overwrite an existing
    zipped file, but will overwrite an existing index. `remove` can be set to
    delete the unzipped file.

    Parameters
    ----------
    vcfname : str
        Name of uncompressed VCF file
    forceflag : bool (False)
        If true, will overwrite (vcfname).gz if it exists
    remove : bool (False)
        If true, will delete uncompressed source file

    Returns
    -------
    cvcfname : str
        Filepath to compressed VCF file
    """
    cvcfname = vcfname+".gz"
    pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
    pysam.tabix_index(cvcfname,preset="vcf",force=True)
    if remove:
        os.remove(vcfname)
    return cvcfname
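A hypothetical invocation, assuming 'sample.vcf' exists and is coordinate-sorted:

gz = compressVcf('sample.vcf', forceflag=True, remove=True)
# gz == 'sample.vcf.gz', with 'sample.vcf.gz.tbi' written alongside it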
Example #18
def to_tabix(bgzip_fname,
             out_fname,
             preset=None,
             chrom_col=None,
             start_col=None,
             end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        preset=preset,
                                        keep_original=True,
                                        index=out_fname,
                                        force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname,
                                        seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1),
                                        end_col=(end_col - 1),
                                        keep_original=True,
                                        index=out_fname,
                                        force=True)
    if os.path.getsize(out_fname) == 0:
        sys.exit(
            "The converted tabix index file is empty, meaning the input data is invalid."
        )
    return bgzip_fname
Example #19
def run(argv):

    if '-h' in argv or '--help' in argv:
        print('Make a single large tabixed file of all phenotypes data')
        exit(1)

    if should_run():
        # we don't need `ffi.new('char[]', ...)` because args are `const`
        ret = lib.cffi_make_matrix(
            sites_filepath.encode('utf8'),
            common_filepaths['pheno']('*').encode('utf8'),
            matrix_gz_tmp_filepath.encode('utf8'))
        ret_bytes = ffi.string(ret, maxlen=1000)
        if ret_bytes != b'ok':
            raise PheWebError(
                'The portion of `pheweb matrix` written in c++/cffi failed with the message '
                + repr(ret_bytes))
        os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath)
        pysam.tabix_index(
            filename=matrix_gz_filepath,
            force=True,
            seq_col=0,
            start_col=1,
            end_col=1  # note: these are 0-based, but `/usr/bin/tabix` is 1-based
        )
    else:
        print('matrix is up-to-date!')
Example #20
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph, bedgraph + '.gz')
    pysam.tabix_index(bedgraph + '.gz',
                      seq_col=0,
                      start_col=1,
                      end_col=2,
                      zerobased=True)
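Once a bedgraph has been compressed and indexed like this, regions can be queried directly. A minimal sketch, assuming the input was named 'signal.bedgraph' and contains a contig called 'chr1':

import pysam

with pysam.TabixFile('signal.bedgraph.gz') as tbx:
    for row in tbx.fetch('chr1', 10000, 20000, parser=pysam.asTuple()):
        chrom, start, end = row[0], int(row[1]), int(row[2])
        value = float(row[3])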
Example #21
    def gff32tabix(self, file_sorted_gff3, file_sorted_gz_gff3, file_gff3_tbi):  # pylint: disable=no-self-use,unused-argument
        """
        GFF3 to Tabix

        Compresses the sorted GFF3 file and then uses Tabix to generate an index
        of the GFF3 file.

        Parameters
        ----------
        file_sorted_gff3 : str
            Location of a sorted GFF3 file
        file_sorted_gz_gff3 : str
            Location of the bgzip compressed GFF3 file
        file_gff3_tbi : str
            Location of the Tabix index file

        Example
        -------
        .. code-block:: python
           :linenos:

           if not self.gff32tabix(file_sorted_gff3, gz_file, tbi_file):
               output_metadata.set_exception(
                   Exception(
                       "gff32tabix: Could not process files {}, {}.".format(*input_files)))
        """
        pysam.tabix_compress(file_sorted_gff3, file_sorted_gz_gff3)  # pylint: disable=no-member
        pysam.tabix_index(file_sorted_gz_gff3, preset='gff')  # pylint: disable=no-member
        return True
Example #22
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join(sorted(genomes))
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception()

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
               outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #23
    def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None):
        """
        Annotate variants in VCF file with translation consequences using snpEff.
        """
        if outVcf.endswith('.vcf.gz'):
            tmpVcf = util.file.mkstempfname(prefix='vcf_snpEff-', suffix='.vcf')
        elif outVcf.endswith('.vcf'):
            tmpVcf = outVcf
        else:
            raise Exception("invalid input")

        sortedAccessionString = ", ".join([util.genbank.parse_accession_str(acc) for acc in sorted(genomes)])
        databaseId = hashlib.sha256(sortedAccessionString.encode('utf-8')).hexdigest()[:55]

        genomeToUse = ""

        # if we don't have the genome, by name (snpEff official) or by hash (custom)
        if (not self.has_genome(databaseId)):
            if (not self.has_genome(genomes[0])):
                _log.info("Checking for snpEff database online...")
                # check to see if it is available for download, and if so install it
                for row in self.available_databases():
                    if (genomes[0].lower() in row['Genome'].lower()) or (
                        genomes[0].lower() in row['Bundle'].lower()
                    ) or (
                        genomes[0].lower() in row['Organism'].lower()
                    ):
                        self.download_db(row['Genome'])

        # backward compatibility for when a single genome name is provided
        if self.has_genome(genomes[0]):
            genomeToUse = genomes[0]
        else:
            # if the hash of the accessions passed in is not present in the genomes db
            if not self.has_genome(databaseId):
                self.create_db(genomes, emailAddress, JVMmemory)

            if self.has_genome(databaseId):
                genomeToUse = databaseId

        if not genomeToUse:
            raise Exception()

        args = [
            '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
            os.path.realpath(inVcf)
        ]

        command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
        if command_ps.returncode == 0:
            with open(tmpVcf, 'wt') as outf:
               outf.write(command_ps.stdout.decode("utf-8"))

            if outVcf.endswith('.vcf.gz'):
                pysam.tabix_compress(tmpVcf, outVcf, force=True)
                pysam.tabix_index(outVcf, force=True, preset='vcf')
                os.unlink(tmpVcf)
        else:
            raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
Example #24
    def testEmptyFileVCFGZWithIndex(self):
        with get_temp_context("tmp_testEmptyFile.vcf") as fn:
            with open(fn, "w"):
                pass
            # tabix_index will automatically compress
            pysam.tabix_index(fn, preset="vcf", force=True)

            self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz")
Example #25
    def setUp( self ):
        
        self.tmpfilename = "tmp_%s.vcf" % id(self)
        shutil.copyfile( self.filename, self.tmpfilename )
        pysam.tabix_index( self.tmpfilename, preset = "vcf" )

        self.tabix = pysam.Tabixfile( self.tmpfilename + ".gz" )
        self.compare = [ x[:-1].split("\t") for x in open( self.filename, "r") if not x.startswith("#") ]
Example #26
    def test_indexing_to_custom_location_works(self):
        '''test indexing a file with a non-default location.'''

        index_path = get_temp_filename(suffix='custom.tbi')
        pysam.tabix_index(self.tmpfilename, preset="gff",
                          index=index_path, force=True)
        self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
        os.unlink(index_path)
Example #27
    def testIndexPresetUncompressed(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename, preset=self.preset)
        # check if uncompressed file has been removed
        self.assertEqual(os.path.exists(self.tmpfilename), False)
        checkBinaryEqual(self.tmpfilename + ".gz", self.filename)
        checkBinaryEqual(self.tmpfilename + ".gz.tbi", self.filename_idx)
Example #28
    def indexingVariantFile(self, varFile):
        """Index variant file with Tabix"""

        logging.info('Trying to index file: {}'.format(varFile))
        pysam.tabix_index(varFile, force=True, preset="vcf")
        if not self.isVariantFileIndexed(varFile):
            raise Exception("Can not index file: {}".format(varFile))
        return True
Example #29
def get_refseq_allele(vcf_file, fasta_file, out_file, verbose=False):
    """
    Output a reference VCF file that contains the ref/alt allele coding
    with the ref allele matching the reference sequence. This assumes that
    all alleles are on the forward strand. If neither the ref nor the alt
    allele is found in the reference sequence, the alleles are output
    as missing.

    Parameters
    ----------
    vcf_file : str
        VCF/BCF file name.
    fasta_file : str
        Fasta file name. Must be unzipped and have an fai index.
    out_file : str
        Output VCF file name.
    """

    rdr = fasta_fai.Reader(fasta_file)
    vcf_in = pysam.VariantFile(vcf_file, mode='r')

    vcf_out = pysam.VariantFile(out_file, mode='w')
    for r in vcf_in.header.records:
        vcf_out.header.add_record(r)

    counter = 0
    for rec in vcf_in.fetch():
        counter = counter + 1
        if verbose and counter % 1000 == 0:
            print(counter, "records")
        rec_out = vcf_out.new_record()
        rec_out.id = rec.id
        rec_out.pos = rec.pos
        rec_out.chrom = rec.chrom
        o = list(range(len(rec.alleles)))
        orv = list(reversed(o))
        ref_i = 0
        for i in orv:
            a = rec.alleles[i]
            refseq_base = rdr.get_seq(rec.chrom, rec.pos - 1, len(a))
            if a == refseq_base:
                ref_i = i
        o[ref_i] = 0
        o[0] = ref_i
        alleles = list()
        for i in o:
            alleles.append(rec.alleles[i])
        if (len(alleles) == 1):
            alleles.append('.')
        rec_out.alleles = tuple(alleles)
        vcf_out.write(rec_out)
    rdr.close()
    vcf_in.close()
    vcf_out.close()
    if out_file[-3:] == ".gz":
        pysam.tabix_index(out_file, preset="vcf", force=True)
Example #30
 def test_indexing_with_lineskipping_works(self):
     '''test indexing via preset and lineskip.'''
     pysam.tabix_index(self.tmpfilename,
                       seq_col=0,
                       start_col=3,
                       end_col=4,
                       line_skip=1,
                       zerobased=False)
     self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
Example #31
 def test_vcf_with_tbi_index(self):
     with get_temp_context("tmp_fn.vcf") as fn:
         shutil.copyfile(self.vcf_filename, fn)
         pysam.tabix_index(fn, preset="vcf", force=True)
         self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
         self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))
         
         with pysam.VariantFile(fn + ".gz") as inf:
             self.assertEqual(len(list(inf.fetch("20"))), 3)
Example #32
def indexFile(input_file):
    sys.stdout.write('Compressing file... ')
    sys.stdout.flush()
    pysam.tabix_compress(input_file, input_file + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file... ')
    sys.stdout.flush()
    pysam.tabix_index(input_file + '.gz', seq_col=4, start_col=6, end_col=7, meta_char='#', force=True)
    sys.stdout.write('OK\n')
Example #33
def run_nfr(args):
    """run nfr calling

    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source = "BAM file") 
    chunks.merge()
    maxQueueSize = args.cores * 10 
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper,
                            fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example #34
def make_non_somatic_panel(file_lst, panelname, genome, cosmic_db, cnt):

    indel_lst = filter_indels(file_lst, genome, cosmic_db, cnt)
    vcf_data = to_vcf_data(indel_lst)

    with open(panelname, "w") as f:
        f.write(vcf_header() + "\n")
        f.write("\n".join(vcf_data))

    pysam.tabix_index(panelname, preset="vcf")
Example #35
    def test_indexing_with_explict_columns_works(self):
        '''test indexing via preset.'''

        pysam.tabix_index(self.tmpfilename,
                          seq_col=0,
                          start_col=3,
                          end_col=4,
                          line_skip=0,
                          zerobased=False)
        self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx))
Example #36
def indexFile(f, options):
    sys.stdout.write(f'Compressing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_compress(os.path.join(options.output_dir, f), os.path.join(options.output_dir, f + '.gz'), force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write(f'Indexing output file {f}... ')
    sys.stdout.flush()
    pysam.tabix_index(os.path.join(options.output_dir, f + '.gz'), seq_col=4, start_col=6, end_col=7, meta_char='#',
                      force=True)
    sys.stdout.write('OK\n')
Example #37
    def test_indexing_to_custom_location_works(self):
        '''test indexing a file with a non-default location.'''

        index_path = get_temp_filename(suffix='custom.tbi')
        pysam.tabix_index(self.tmpfilename,
                          preset="gff",
                          index=index_path,
                          force=True)
        self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
        os.unlink(index_path)
Example #38
    def _index_with_tabix(self):
        """Compress and index output file by Tabix"""

        pysam.tabix_compress(self._fn + '_tmp', self._fn + '.gz', force=True)
        pysam.tabix_index(self._fn + '.gz',
                          seq_col=self.idx_chrom,
                          start_col=self.idx_start,
                          end_col=self.idx_end,
                          meta_char='#',
                          force=True)
Example #39
def process_vcf(archive, vcf, vcf_index, output_prefix):
    """
    Extracts and processes the caveman vcf file.
    """
    out_raw_vcf = '{0}.tmp.vcf.gz'.format(output_prefix)
    logger.info("Extracting raw vcf to tmp file {0}".format(out_raw_vcf))
    extract_file(archive, vcf, out_raw_vcf)

    out_raw_vcf_index = '{0}.tmp.vcf.gz.tbi'.format(output_prefix)
    logger.info(
        "Extracting raw vcf index to tmp file {0}".format(out_raw_vcf_index))
    extract_file(archive, vcf_index, out_raw_vcf_index)

    # Update the sample name using BGZFile which doesn't assert any VCF format
    logger.info("Processing raw VCF to change TUMOUR -> TUMOR...")
    out_formatted_vcf = '{0}.vcf.gz'.format(output_prefix)
    logger.info("Creating final vcf {0}".format(out_formatted_vcf))
    writer = pysam.BGZFile(out_formatted_vcf, mode='wb')
    reader = pysam.BGZFile(out_raw_vcf, mode='rb')
    try:
        for line in reader:
            line = line.decode('utf-8')
            if line.startswith('##'):
                if line.startswith('##SAMPLE=<ID=TUMOUR'):
                    new_line = line.replace('ID=TUMOUR', 'ID=TUMOR') + '\n'
                    writer.write(new_line.encode('utf-8'))
                else:
                    new_line = line + '\n'
                    writer.write(new_line.encode('utf-8'))
            elif line.startswith('#CHROM'):
                new_line = line.replace('TUMOUR', 'TUMOR') + '\n'
                writer.write(new_line.encode('utf-8'))
            else:
                # BINF-306: fix rare case of alt == ref in caveman vcf.
                cols = line.split('\t')
                if cols[3] == cols[4]:
                    logger.warning(
                        "Removing loci {0}:{1} where ref and alt alleles are same: {2} - {3}"
                        .format(cols[0], cols[1], cols[3], cols[4]))
                    continue
                new_line = line + '\n'
                writer.write(new_line.encode('utf-8'))
    finally:
        writer.close()
        reader.close()

    # tabix index
    logger.info("Creating final vcf index {0}".format(out_formatted_vcf +
                                                      '.tbi'))
    pysam.tabix_index(out_formatted_vcf, preset='vcf', force=True)

    # clean up
    logger.info("Cleaning up tmp files...")
    os.remove(out_raw_vcf)
    os.remove(out_raw_vcf_index)
Example #40
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1, # skip header
    )
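For reference, the roughly equivalent shell command (tabix numbers columns from 1 where pysam.tabix_index numbers them from 0; 'matrix.gz' stands in for the bgzipped file):

tabix -f -s 1 -b 2 -e 2 -S 1 matrix.gz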
Example #41
def run_merge(args):
    if not args.out:
        args.out = '.'.join(os.path.basename(args.nucpos).split('.')[0:-3])
    occ = NucList.read(args.occpeaks, "occ", args.min_occ)
    nuc = NucList.read(args.nucpos, "nuc", args.min_occ)
    new = merge(occ, nuc, args.sep)
    out = open(args.out + '.nucmap_combined.bed','w')
    out.write(new.asBed())
    out.close()
    pysam.tabix_compress(args.out + '.nucmap_combined.bed', args.out + '.nucmap_combined.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nucmap_combined.bed')
    pysam.tabix_index(args.out + '.nucmap_combined.bed.gz', preset = "bed", force = True)
Example #42
    def testEmptyFileVCFGZWithIndex(self):
        with open("tmp_testEmptyFile.vcf", "w"):
            pass

        pysam.tabix_index("tmp_testEmptyFile.vcf",
                          preset="vcf",
                          force=True)

        self.assertRaises(ValueError, pysam.VariantFile,
                          "tmp_testEmptyFile.vcf.gz")

        os.unlink("tmp_testEmptyFile.vcf.gz")
        os.unlink("tmp_testEmptyFile.vcf.gz.tbi")
Example #43
    def index(destDir, inputFilename, fileColumnNumList=None, preset=None):
        """
        Create a tabix index file for genomic position datasource tsv files.
        Prerequisites (for genomic position indexed):
            Input file has three columns that can be mapped to chromosome, start position, and end position without any modification.
                For example, ['hg19.oreganno.chrom', 'hg19.oreganno.chromStart', 'hg19.oreganno.chromEnd'] in oreganno.hg19.txt

        This will overwrite an existing index (since the force parameter is set to True in pysam.tabix_index() call).
        Also, in cases where the inputFilename doesn't end with a ".gz", a compressed file will be created and indexed.

        :param destDir: destination directory
        :param fileColumnNumList: ordered list.  This list contains the corresponding entries (column numbers)
            in the tsv file. Typically, this would be [chr,start,end]  or [gene, startAA, endAA]
        :param inputFilename: tsv file input
        :param preset: if preset is provided, the column coordinates are taken from a preset. Valid values for preset
        are "gff", "bed", "sam", "vcf", "psltbl", and "pileup".
        """
        fileColumnNumList = [] if fileColumnNumList is None else fileColumnNumList
        inputFilename = os.path.abspath(inputFilename)
        fileDir = os.path.dirname(inputFilename)
        fileName, fileExtension = os.path.splitext(os.path.basename(inputFilename))

        if fileExtension in (".gz",):
            # Ensure .gz.tbi file is there as well
            inputIndexFilename = os.path.join(fileDir, ".".join([inputFilename, "tbi"]))
            if not os.path.exists(inputIndexFilename):
                msg = "Missing tabix index file %s." % inputIndexFilename
                raise TabixIndexerFileMissingError(msg)

            outputFilename = os.path.join(destDir, string.join([fileName, "gz"], "."))
            shutil.copyfile(inputFilename, outputFilename)

            outputIndexFilename = os.path.join(destDir, ".".join([fileName, "gz", "tbi"]))
            shutil.copyfile(inputIndexFilename, outputIndexFilename)

            return outputFilename

        outputFilename = os.path.join(destDir, string.join([fileName, ".tabix_indexed", fileExtension], ""))
        # Copy the input file to output file.
        shutil.copyfile(inputFilename, outputFilename)

        # Load the file into a tsvReader.
        if preset in ("gff", "bed", "sam", "vcf", "psltbl", "pileup"):
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, preset=preset)
        else:
            # Have to specify min_size=0 in pysam 0.8.1 to get pysam to correctly output a .tbi file 
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, seq_col=fileColumnNumList[0],
                                            start_col=fileColumnNumList[1], end_col=fileColumnNumList[2])

        return tabix_index
Example #44
def run(args):
    fin=IO.fopen(args.input,"r")
    outfile=args.input
    if not args.sorted:
        l = [ i for i in TableIO.parse(fin,args.format) ]
        l.sort()
        name=splitext(args.input)
        outfile = "{name[0]}.sorted{name[1]}".format(name=name)
        out = IO.fopen(outfile,"w")
        for i in l:
            print(i,file=out)
        out.close()
    format = args.format.translate(str.maketrans('', '', digits))
    tabix_index(outfile,preset=format)
Example #45
def to_tabix(bgzip_fname, out_fname, preset=None, chrom_col=None, start_col=None, end_col=None):
    # Create index.
    if preset:
        # Preset type.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, preset=preset, keep_original=True,
                                        index=out_fname, force=True)
    else:
        # For interval files; column indices are 0-based.
        bgzip_fname = pysam.tabix_index(filename=bgzip_fname, seq_col=(chrom_col - 1),
                                        start_col=(start_col - 1), end_col=(end_col - 1),
                                        keep_original=True, index=out_fname, force=True)
    if os.path.getsize(out_fname) == 0:
        sys.stderr.write("The converted tabix index file is empty, meaning the input data is invalid.")
    return bgzip_fname
Example #46
def merge_vcfs(in_vcfs_dir, contigs, out_vcf):
    logger.info("Mergings per-chromosome VCFs from %s" % in_vcfs_dir)
    header_done = False
    out_vcf_file = open(out_vcf, "w")
    for contig in contigs:
        chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name)
        if os.path.isfile(chr_vcf):
            chr_tabix_file = pysam.Tabixfile(chr_vcf)
            if not header_done:
                print_header(chr_tabix_file.header, out_vcf_file)
            for entry in chr_tabix_file.fetch():
                out_vcf_file.write("%s\n" % entry)
            chr_tabix_file.close()
    out_vcf_file.close()
    pysam.tabix_index(out_vcf, force=True, preset="vcf")
Example #47
File: gtf.py Project: soh-i/Ivy
 def _prepare(self):
     if not os.path.isfile(self.ingtf + ".gz.tbi"):
         print "Generate indexed GTF (tabix) file: '{0}'...".format(self.ingtf)
         compressed_gtf = pysam.tabix_index(self.ingtf, preset="gff")
     else:
         compressed_gtf = self.ingtf + ".gz"
     self.tabixfile = pysam.Tabixfile(compressed_gtf)
Example #48
 def addVariantSet(self, variantFileName, dataset, referenceSet, ontology):
     inputVcf = os.path.join(
         self.inputDirectory, variantFileName)
     outputVcf = os.path.join(
         self.outputDirectory, variantFileName)
     shutil.copy(inputVcf, outputVcf)
     pysam.tabix_index(outputVcf, preset="vcf")
     variantSet = variants.HtslibVariantSet(
         dataset, variantFileName.split('_')[1])
     variantSet.setReferenceSet(referenceSet)
     variantSet.populateFromFile(
         [outputVcf + ".gz"], [outputVcf + ".gz.tbi"])
     variantSet.checkConsistency()
     self.repo.insertVariantSet(variantSet)
     for annotationSet in variantSet.getVariantAnnotationSets():
         annotationSet.setOntology(ontology)
         self.repo.insertVariantAnnotationSet(annotationSet)
Example #49
def bgzip_index(original_file, new_file, file_format):
    """

    :param original_file:
    :param new_file:
    :param file_format:
    :return:
    """

    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
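Hypothetical calls, with the file names as placeholders:

bgzip_index('variants.vcf', 'variants.vcf.gz', 'vcf')  # tabix_index compresses in place
bgzip_index('genome.fa', 'genome.fa.gz', 'fa')         # bgzip-compress, then faidx the result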
Example #50
 def __tabix(self, file_name):
     """ tabix into gz and tbi file """
     return pysam.tabix_index(file_name,
                              force     = True,
                              seq_col   = combivep_settings.LJB_PARSED_0_INDEX_CHROM,
                              start_col = combivep_settings.LJB_PARSED_0_INDEX_POS,
                              end_col   = combivep_settings.LJB_PARSED_0_INDEX_POS,
                              zerobased = False)
Example #51
def getTabixMod(filename):
    '''Unzip a mod file, use bgzip to rezip it, and build a tabix index.'''
    logger = logging.getLogger('tmpmod') 
    logger.info('extracting MOD file ...')   
    modfp = gzip.open(filename, 'rb')
    tmpName = tempfile.mkstemp('.tsv')[1]    
    tmpfp = open(tmpName, 'wb')
    tmpfp.writelines(modfp)
    tmpfp.close()
    modfp.close()    
    pysam.tabix_index(tmpName, force=True, seq_col=1, start_col=2, end_col=2, 
                      meta_char='#', zerobased=True)
    tmpName += '.gz'
    logger.info('temporary file %s created', tmpName)
    return tmpName

#print(getTabixMod("../data/B.mod"))