Beispiel #1
0
 def __init__(self,
              outFile,
              header_prefix='#',
              col_chr=1,
              col_start=2,
              col_stop=2,
              preset=None,
              zerobased=False,
              tabixPath=None):
     ''' Remember the output path and column settings and open a temp
         text file for writing.

         outFile must end in '.gz' and, if it already exists, must be
         writable.  The column numbers, preset, zerobased flag and
         header_prefix are only stored in self.params here.
     '''
     assert outFile.endswith('.gz')
     # a not-yet-existing output file is fine; an existing one must be writable
     assert os.access(outFile, os.W_OK) or not os.access(outFile, os.F_OK)
     self.outFile = outFile

     # name the temp file after the output's base name up to its first dot
     base_name = outFile.rsplit('/', 1)[-1]
     stem = base_name.partition('.')[0]
     self.tmpFile = util_files.mkstempfname(suffix='.txt',
                                            prefix='tabix-%s-' % stem)
     self.tmp_f = open(self.tmpFile, 'wt')

     self.params = {
         'seq_col': col_chr, 'start_col': col_start, 'end_col': col_stop,
         'preset': preset, 'zerobased': zerobased,
         'meta_char': header_prefix, 'line_skip': 0,
     }
     self.header = None
     self.n_header = self.n_rows = 0
     self.tabixPath = tabixPath
     self.last_chr_start = [None] * 2
     self.seen_chrs = set()
Beispiel #2
0
def vcf_bgzip_index(inVcf, outVcf, tabixPath=None, vcftoolsPath=None):
	''' Compress (bgzip) and index (tabix and/or vcftools) a VCF file.

		inVcf must end in '.vcf' or '.vcf.gz' and outVcf in '.vcf.gz'.
		A plain '.vcf' input is bgzip-compressed into outVcf; an already
		compressed input is copied to outVcf unless both names are equal.
		outVcf is then indexed with tabix and, if vcftoolsPath is given,
		with vcftools as well.

		tabixPath is the directory holding the bgzip and tabix programs
		and vcftoolsPath the one holding vcftools.  When tabixPath is None
		the bare program names are used (looked up on PATH).  Arguments
		are handed to the programs directly, without shell interpretation.

		Returns outVcf.  Raises AssertionError if a file name has the
		wrong extension or an external command cannot be run or exits
		with a non-zero status.
	'''
	import subprocess

	assert (inVcf.endswith('.vcf') or inVcf.endswith('.vcf.gz')) and outVcf.endswith('.vcf.gz')

	def prog(dirname, name):
		# keep the "<dir>/<name>" form for an explicit directory; fall back
		# to a PATH lookup instead of the literal "None/<name>"
		return name if dirname is None else '%s/%s' % (dirname, name)

	def run(argv, stdout_path=None):
		# Executed outside of any assert statement so the command still
		# runs under "python -O"; AssertionError is raised explicitly so
		# callers see the same exception type as before.
		try:
			if stdout_path is None:
				status = subprocess.call(argv)
			else:
				with open(stdout_path, 'wb') as outf:
					status = subprocess.call(argv, stdout=outf)
		except (OSError, IOError) as err:
			raise AssertionError('could not run %s: %s' % (' '.join(argv), err))
		if status:
			raise AssertionError('command failed with status %s: %s' % (status, ' '.join(argv)))

	if inVcf.endswith('.vcf'):
		log.info("compressing output to %s" % outVcf)
		run([prog(tabixPath, 'bgzip'), '-c', inVcf], stdout_path=outVcf)
	elif inVcf==outVcf:
		log.info("leaving %s, already compressed" % inVcf)
	else:
		log.info("copying compressed vcf to %s" % outVcf)
		shutil.copy(inVcf, outVcf)

	log.info("indexing with tabix")
	run([prog(tabixPath, 'tabix'), outVcf, '-f', '-p', 'vcf'])

	if vcftoolsPath:
		log.info("indexing with vcftools")
		tmpFile = util_files.mkstempfname(prefix='vcftools-log-', suffix='.vcf')
		run([prog(vcftoolsPath, 'vcftools'), '--gzvcf', outVcf,
			'--out', tmpFile, '--force-index-write'])
		# remove the placeholder and the log written next to it
		os.unlink(tmpFile)
		os.unlink(tmpFile+'.log')
	return outVcf
Beispiel #3
0
def vcf_bgzip_index(inVcf, outVcf, tabixPath=None, vcftoolsPath=None):
    ''' Compress (bgzip) and index (tabix and/or vcftools) a VCF file.

        inVcf must end in '.vcf' or '.vcf.gz' and outVcf in '.vcf.gz'.
        A plain '.vcf' input is bgzip-compressed into outVcf; an already
        compressed input is copied to outVcf unless both names are equal.
        outVcf is then indexed with tabix and, if vcftoolsPath is given,
        with vcftools as well.

        tabixPath is the directory holding the bgzip and tabix programs
        and vcftoolsPath the one holding vcftools.  When tabixPath is None
        the bare program names are used (looked up on PATH).  Arguments
        are handed to the programs directly, without shell interpretation.

        Returns outVcf.  Raises AssertionError if a file name has the
        wrong extension or an external command cannot be run or exits
        with a non-zero status.
    '''
    import subprocess

    assert (inVcf.endswith('.vcf')
            or inVcf.endswith('.vcf.gz')) and outVcf.endswith('.vcf.gz')

    def prog(dirname, name):
        # keep the "<dir>/<name>" form for an explicit directory; fall back
        # to a PATH lookup instead of the literal "None/<name>"
        return name if dirname is None else '%s/%s' % (dirname, name)

    def run(argv, stdout_path=None):
        # Executed outside of any assert statement so the command still
        # runs under "python -O"; AssertionError is raised explicitly so
        # callers see the same exception type as before.
        try:
            if stdout_path is None:
                status = subprocess.call(argv)
            else:
                with open(stdout_path, 'wb') as outf:
                    status = subprocess.call(argv, stdout=outf)
        except (OSError, IOError) as err:
            raise AssertionError('could not run %s: %s' %
                                 (' '.join(argv), err))
        if status:
            raise AssertionError('command failed with status %s: %s' %
                                 (status, ' '.join(argv)))

    if inVcf.endswith('.vcf'):
        log.info("compressing output to %s" % outVcf)
        run([prog(tabixPath, 'bgzip'), '-c', inVcf], stdout_path=outVcf)
    elif inVcf == outVcf:
        log.info("leaving %s, already compressed" % inVcf)
    else:
        log.info("copying compressed vcf to %s" % outVcf)
        shutil.copy(inVcf, outVcf)

    log.info("indexing with tabix")
    run([prog(tabixPath, 'tabix'), outVcf, '-f', '-p', 'vcf'])

    if vcftoolsPath:
        log.info("indexing with vcftools")
        tmpFile = util_files.mkstempfname(prefix='vcftools-log-',
                                          suffix='.vcf')
        run([prog(vcftoolsPath, 'vcftools'), '--gzvcf', outVcf,
             '--out', tmpFile, '--force-index-write'])
        # remove the placeholder and the log written next to it
        os.unlink(tmpFile)
        os.unlink(tmpFile + '.log')
    return outVcf
Beispiel #4
0
    def __init__(self,
                 inVcf,
                 enforce_unique_pos=True,
                 sample=None,
                 interval=None):
        assert inVcf.endswith('.vcf.gz')
        clens = get_chrlens(inVcf)
        self.clens = dict(clens)
        self.contigs = [c for c, l in clens]
        self.sample_names = vcf_sample_names(inVcf)
        root_fname = inVcf[:-7].split('/')[-1]
        self.dbFile = util_files.mkstempfname(prefix='%s-' % root_fname,
                                              suffix='.db')
        self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
        self.cur = self.conn.cursor()
        self.cur.execute("""create table cons (
			chr string not null,
			pos integer not null,
			allele string not null)""")
        self.cur.execute("create %s index cons_idx on cons(chr,pos)" %
                         (enforce_unique_pos and "unique" or ""))
        self.conn.commit()
        self.cur.executemany(
            "insert into cons (chr,pos,allele) values (?,?,?)",
            [(c, p, alleles[genos[0][1]]) for c, p, alleles, genos in
             vcf_haploid_iterator(inVcf,
                                  sample_list=sample and [sample] or None,
                                  interval=interval)])
        self.conn.commit()
Beispiel #5
0
	def __init__(self, dbFile=None):
		''' Connect to the SQLite file dbFile and switch on foreign keys.

			When dbFile is None a temp file name is obtained from
			util_files.mkstempfname.  The SQLite version and the foreign
			key setting read back from the connection are logged at debug
			level, then self.start() is called.
		'''
		if dbFile is None:
			dbFile = util_files.mkstempfname(suffix='.db')
		self.conn = sqlite3.connect(dbFile, isolation_level='DEFERRED')
		# sanity check: transactions must not be in autocommit mode
		assert self.conn.isolation_level
		self.cur = self.conn.cursor()
		self.cur.execute("PRAGMA foreign_keys=ON")
		# query the setting back so the log shows whether it took effect
		self.cur.execute("PRAGMA foreign_keys")
		fk = self.cur.fetchone()
		log.debug("SQLite version: %s" % sqlite3.sqlite_version)
		log.debug("SQLite foreign key support: %s" % ('true' if (fk and fk[0]) else 'false'))
		self.start()
Beispiel #6
0
    def __init__(self,
                 inVcf,
                 enforce_unique_pos=True,
                 sample_list=None,
                 drop_indels=False,
                 drop_monomorphic=False,
                 drop_multiallelic=False,
                 interval=None):
        assert inVcf.endswith('.vcf.gz')
        clens = get_chrlens(inVcf)
        self.clens = dict(clens)
        self.contigs = [c for c, l in clens]
        self.sample_names = vcf_sample_names(inVcf)
        root_fname = inVcf[:-7].split('/')[-1]
        self.dbFile = util_files.mkstempfname(prefix='%s-' % root_fname,
                                              suffix='.db')
        self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
        self.cur = self.conn.cursor()
        self.cur.execute("""create table snp (
			chr string not null,
			pos integer not null,
			alleles string not null)""")
        self.cur.execute("""create table geno (
			chr string not null,
			pos integer not null,
			sample string not null,
			allele integer not null)""")
        self.cur.execute("create %s index snp_idx on snp(chr,pos)" %
                         (enforce_unique_pos and "unique" or ""))
        self.cur.execute("create %s index geno_idx on geno(chr,pos,sample)" %
                         (enforce_unique_pos and "unique" or ""))
        self.conn.commit()
        self.cur.executemany(
            "insert into snp (chr,pos,alleles) values (?,?,?)",
            [(c, p, ','.join(alleles)) for c, p, alleles, genos in
             vcf_haploid_iterator(inVcf,
                                  sample_list=sample_list,
                                  drop_indels=drop_indels,
                                  drop_monomorphic=drop_monomorphic,
                                  drop_multiallelic=drop_multiallelic,
                                  interval=interval)])
        self.conn.commit()
        self.cur.executemany(
            "insert into geno (chr,pos,sample,allele) values (?,?,?,?)",
            self._geno_iterator(
                vcf_haploid_iterator(inVcf,
                                     sample_list=sample_list,
                                     drop_indels=drop_indels,
                                     drop_monomorphic=drop_monomorphic,
                                     drop_multiallelic=drop_multiallelic,
                                     interval=interval)))
        self.conn.commit()
Beispiel #7
0
def get_chroms(inVcf, tabixPath=None):
	''' Get a list of unique chromosomes for this genome, in sort order.
		(Use tabix to do it quickly)

		Runs "tabix -l inVcf" and returns its output lines with the line
		terminators removed.  tabixPath is the directory holding the tabix
		program; when it is None the bare name "tabix" is used (looked up
		on PATH).  Arguments are handed to tabix directly, without shell
		interpretation.

		Raises AssertionError if tabix cannot be run or exits with a
		non-zero status.
	'''
	import subprocess

	tabix = 'tabix' if tabixPath is None else '%s/tabix' % tabixPath
	argv = [tabix, '-l', inVcf]
	# The command is run outside of an assert statement so it is still
	# executed under "python -O"; its output is read from a pipe, so no
	# temp file is left behind on failure.
	try:
		proc = subprocess.Popen(argv, stdout=subprocess.PIPE, universal_newlines=True)
	except OSError as err:
		raise AssertionError('could not run %s: %s' % (' '.join(argv), err))
	chroms = [line.rstrip('\r\n') for line in proc.stdout]
	proc.stdout.close()
	status = proc.wait()
	if status:
		# same exception type as the original "assert not os.system(...)"
		raise AssertionError('command failed with status %s: %s' % (status, ' '.join(argv)))
	return chroms
Beispiel #8
0
 def __init__(self, dbFile=None):
     ''' Connect to the SQLite file dbFile and switch on foreign keys.

         When dbFile is None a temp file name is obtained from
         util_files.mkstempfname.  The SQLite version and the foreign
         key setting read back from the connection are logged at debug
         level, then self.start() is called.
     '''
     if dbFile is None:
         dbFile = util_files.mkstempfname(suffix='.db')
     self.conn = sqlite3.connect(dbFile, isolation_level='DEFERRED')
     # sanity check: transactions must not be in autocommit mode
     assert self.conn.isolation_level
     self.cur = self.conn.cursor()
     self.cur.execute("PRAGMA foreign_keys=ON")
     # query the setting back so the log shows whether it took effect
     self.cur.execute("PRAGMA foreign_keys")
     fk = self.cur.fetchone()
     log.debug("SQLite version: %s" % sqlite3.sqlite_version)
     log.debug("SQLite foreign key support: %s" %
               ('true' if (fk and fk[0]) else 'false'))
     self.start()
Beispiel #9
0
def get_chroms(inVcf, tabixPath=None):
    ''' Get a list of unique chromosomes for this genome, in sort order.
        (Use tabix to do it quickly)

        Runs "tabix -l inVcf" and returns its output lines with the line
        terminators removed.  tabixPath is the directory holding the tabix
        program; when it is None the bare name "tabix" is used (looked up
        on PATH).  Arguments are handed to tabix directly, without shell
        interpretation.

        Raises AssertionError if tabix cannot be run or exits with a
        non-zero status.
    '''
    import subprocess

    tabix = 'tabix' if tabixPath is None else '%s/tabix' % tabixPath
    argv = [tabix, '-l', inVcf]
    # The command is run outside of an assert statement so it is still
    # executed under "python -O"; its output is read from a pipe, so no
    # temp file is left behind on failure.
    try:
        proc = subprocess.Popen(argv,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as err:
        raise AssertionError('could not run %s: %s' % (' '.join(argv), err))
    chroms = [line.rstrip('\r\n') for line in proc.stdout]
    proc.stdout.close()
    status = proc.wait()
    if status:
        # same exception type as the original "assert not os.system(...)"
        raise AssertionError('command failed with status %s: %s' %
                             (status, ' '.join(argv)))
    return chroms
Beispiel #10
0
def vcf_subset(inVcf, c, start_stop=None, outVcf=None, keepHeader=False, tabixPath=None):
	''' Pull just a piece of a VCF file into a new VCF file (create a temp
		file if outVcf is not specified).

		Runs tabix on inVcf for region c, optionally narrowed to
		"c:start-stop" when start_stop (a pair of integers) is given, and
		writes tabix's output to outVcf.  keepHeader adds tabix's -h flag.
		tabixPath is the directory holding the tabix program; when it is
		None the bare name "tabix" is used (looked up on PATH).  Arguments
		are handed to tabix directly, without shell interpretation.

		inVcf must end in '.vcf.gz' and outVcf in '.vcf'.  Returns outVcf.
		Raises AssertionError on a wrong extension, or if tabix cannot be
		run or exits with a non-zero status.
	'''
	import subprocess

	if outVcf is None:
		outVcf = util_files.mkstempfname(prefix='vcf_subset-%s-'%c, suffix='.vcf')
	assert inVcf.endswith('.vcf.gz') and outVcf.endswith('.vcf')
	argv = ['tabix' if tabixPath is None else '%s/tabix' % tabixPath]
	if keepHeader:
		argv.append('-h')
	region = '%s' % c
	if start_stop:
		region += ':%d-%d' % start_stop
	argv += [inVcf, region]
	# The command is run outside of an assert statement so it is still
	# executed under "python -O".
	try:
		with open(outVcf, 'wb') as outf:
			status = subprocess.call(argv, stdout=outf)
	except (OSError, IOError) as err:
		raise AssertionError('could not run %s: %s' % (' '.join(argv), err))
	if status:
		# same exception type as the original "assert not os.system(...)"
		raise AssertionError('command failed with status %s: %s' % (status, ' '.join(argv)))
	return outVcf
Beispiel #11
0
	def __init__(self, outFile, header_prefix='#',
			col_chr=1, col_start=2, col_stop=2, preset=None, zerobased=False,
			tabixPath=None):
		''' Remember the output path and column settings and open a temp
			text file for writing.

			outFile must end in '.gz' and, if it already exists, must be
			writable.  The column numbers, preset, zerobased flag and
			header_prefix are only stored in self.params here.
		'''
		assert outFile.endswith('.gz')
		# a not-yet-existing output file is fine; an existing one must be writable
		assert os.access(outFile, os.W_OK) or not os.access(outFile, os.F_OK)
		self.outFile = outFile

		# name the temp file after the output's base name up to its first dot
		base_name = outFile.rsplit('/', 1)[-1]
		stem = base_name.partition('.')[0]
		self.tmpFile = util_files.mkstempfname(
			suffix='.txt',
			prefix='tabix-%s-' % stem)
		self.tmp_f = open(self.tmpFile, 'wt')

		self.params = {
			'seq_col': col_chr,
			'start_col': col_start,
			'end_col': col_stop,
			'preset': preset,
			'zerobased': zerobased,
			'meta_char': header_prefix,
			'line_skip': 0,
		}
		self.header = None
		self.n_header = self.n_rows = 0
		self.tabixPath = tabixPath
		self.last_chr_start = [None] * 2
		self.seen_chrs = set()
Beispiel #12
0
	def __init__(self, inVcf,
		enforce_unique_pos=True, sample_list=None,
		drop_indels=False, drop_monomorphic=False, drop_multiallelic=False,
		interval=None):
		''' Load the records of a VCF into a temp SQLite database.

			inVcf must end in '.vcf.gz'.  Two tables are created in a temp
			file (self.dbFile): snp(chr, pos, alleles), holding the
			comma-joined alleles of each record yielded by
			vcf_haploid_iterator, and geno(chr, pos, sample, allele),
			filled from self._geno_iterator over a second pass of the same
			iterator.  sample_list, the drop_* flags and interval are
			passed through to vcf_haploid_iterator; enforce_unique_pos
			makes both indexes unique.
		'''
		assert inVcf.endswith('.vcf.gz')
		# materialize once so both passes below see every entry even if
		# get_chrlens returns a one-shot iterator
		clens = list(get_chrlens(inVcf))
		self.clens = dict(clens)
		self.contigs = [c for c,l in clens]
		self.sample_names = vcf_sample_names(inVcf)
		# strip the 7-character '.vcf.gz' suffix and any directory part
		root_fname = inVcf[:-7].split('/')[-1]
		self.dbFile = util_files.mkstempfname(prefix='%s-'%root_fname,suffix='.db')
		self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
		self.cur = self.conn.cursor()
		# NOTE(review): per SQLite's type-affinity rules a column declared
		# "string" gets NUMERIC affinity, so numeric-looking values such as
		# '1' are stored as integers; "text" may have been intended.  Left
		# unchanged because it would alter what queries return.
		self.cur.execute("""create table snp (
			chr string not null,
			pos integer not null,
			alleles string not null)""")
		self.cur.execute("""create table geno (
			chr string not null,
			pos integer not null,
			sample string not null,
			allele integer not null)""")
		unique = "unique" if enforce_unique_pos else ""
		self.cur.execute("create %s index snp_idx on snp(chr,pos)" % unique)
		self.cur.execute("create %s index geno_idx on geno(chr,pos,sample)" % unique)
		self.conn.commit()

		def records():
			# a fresh pass over the VCF with the caller's filters
			return vcf_haploid_iterator(inVcf, sample_list=sample_list,
				drop_indels=drop_indels, drop_monomorphic=drop_monomorphic,
				drop_multiallelic=drop_multiallelic,
				interval=interval)

		# stream the rows instead of building the whole VCF as a list first
		self.cur.executemany("insert into snp (chr,pos,alleles) values (?,?,?)",
			((c,p,','.join(alleles)) for c,p,alleles,genos in records()))
		self.conn.commit()
		self.cur.executemany("insert into geno (chr,pos,sample,allele) values (?,?,?,?)",
			self._geno_iterator(records()))
		self.conn.commit()
Beispiel #13
0
	def __init__(self, snpEffVcf=None, snpIterator=None):
		''' Create an empty annot table in a temp SQLite file and, when
			snpEffVcf is given, hand it to self.loadVcf.

			snpIterator is only stored on the instance here.
		'''
		self.snpIterator = snpIterator
		db_path = util_files.mkstempfname(prefix='SnpAnnotater-', suffix='.db')
		self.dbFile = db_path
		connection = sqlite3.connect(db_path, isolation_level='DEFERRED')
		self.conn = connection
		cursor = connection.cursor()
		self.cur = cursor
		create_table_sql = """create table annot (
			chr not null,
			pos integer not null,
			allele_ref not null,
			allele_alt not null,
			effect not null,
			impact not null,
			gene_id,
			gene_name,
			protein_pos integer,
			residue_ref,
			residue_alt
		)"""
		create_index_sql = "create index idx_annot on annot(chr,pos)"
		# table first, then the (chr,pos) lookup index on it
		for statement in (create_table_sql, create_index_sql):
			cursor.execute(statement)
		if not snpEffVcf:
			return
		self.loadVcf(snpEffVcf)
Beispiel #14
0
    def __init__(self, snpEffVcf=None, snpIterator=None):
        self.snpIterator = snpIterator
        self.dbFile = util_files.mkstempfname(prefix='SnpAnnotater-',
                                              suffix='.db')
        self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
        self.cur = self.conn.cursor()
        self.cur.execute("""create table annot (
			chr not null,
			pos integer not null,
			allele_ref not null,
			allele_alt not null,
			effect not null,
			impact not null,
			gene_id,
			gene_name,
			protein_pos integer,
			residue_ref,
			residue_alt
		)""")
        self.cur.execute("create index idx_annot on annot(chr,pos)")
        if snpEffVcf:
            self.loadVcf(snpEffVcf)
Beispiel #15
0
def vcf_subset(inVcf,
               c,
               start_stop=None,
               outVcf=None,
               keepHeader=False,
               tabixPath=None):
    ''' Pull just a piece of a VCF file into a new VCF file (create a temp
        file if outVcf is not specified).

        Runs tabix on inVcf for region c, optionally narrowed to
        "c:start-stop" when start_stop (a pair of integers) is given, and
        writes tabix's output to outVcf.  keepHeader adds tabix's -h flag.
        tabixPath is the directory holding the tabix program; when it is
        None the bare name "tabix" is used (looked up on PATH).  Arguments
        are handed to tabix directly, without shell interpretation.

        inVcf must end in '.vcf.gz' and outVcf in '.vcf'.  Returns outVcf.
        Raises AssertionError on a wrong extension, or if tabix cannot be
        run or exits with a non-zero status.
    '''
    import subprocess

    if outVcf is None:
        outVcf = util_files.mkstempfname(prefix='vcf_subset-%s-' % c,
                                         suffix='.vcf')
    assert inVcf.endswith('.vcf.gz') and outVcf.endswith('.vcf')
    argv = ['tabix' if tabixPath is None else '%s/tabix' % tabixPath]
    if keepHeader:
        argv.append('-h')
    region = '%s' % c
    if start_stop:
        region += ':%d-%d' % start_stop
    argv += [inVcf, region]
    # The command is run outside of an assert statement so it is still
    # executed under "python -O".
    try:
        with open(outVcf, 'wb') as outf:
            status = subprocess.call(argv, stdout=outf)
    except (OSError, IOError) as err:
        raise AssertionError('could not run %s: %s' % (' '.join(argv), err))
    if status:
        # same exception type as the original "assert not os.system(...)"
        raise AssertionError('command failed with status %s: %s' %
                             (status, ' '.join(argv)))
    return outVcf
Beispiel #16
0
	def __init__(self, inVcf,
		enforce_unique_pos=True, sample=None, interval=None):
		''' Load one allele per VCF record into a temp SQLite database.

			inVcf must end in '.vcf.gz'.  A table cons(chr, pos, allele) is
			created in a temp file (self.dbFile) and filled with
			alleles[genos[0][1]] for every record that vcf_haploid_iterator
			yields (presumably the call of the first/only sample -- confirm
			against vcf_haploid_iterator).  sample, if given, restricts the
			iterator to that single sample; interval is passed through.
			enforce_unique_pos makes the (chr,pos) index unique.
		'''
		assert inVcf.endswith('.vcf.gz')
		# materialize once so both passes below see every entry even if
		# get_chrlens returns a one-shot iterator
		clens = list(get_chrlens(inVcf))
		self.clens = dict(clens)
		self.contigs = [c for c,l in clens]
		self.sample_names = vcf_sample_names(inVcf)
		# strip the 7-character '.vcf.gz' suffix and any directory part
		root_fname = inVcf[:-7].split('/')[-1]
		self.dbFile = util_files.mkstempfname(prefix='%s-'%root_fname,suffix='.db')
		self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
		self.cur = self.conn.cursor()
		# NOTE(review): per SQLite's type-affinity rules a column declared
		# "string" gets NUMERIC affinity, so numeric-looking values such as
		# '1' are stored as integers; "text" may have been intended.  Left
		# unchanged because it would alter what queries return.
		self.cur.execute("""create table cons (
			chr string not null,
			pos integer not null,
			allele string not null)""")
		self.cur.execute("create %s index cons_idx on cons(chr,pos)" % (
			"unique" if enforce_unique_pos else ""))
		self.conn.commit()
		# stream the rows instead of building the whole VCF as a list first
		rows = ((c,p,alleles[genos[0][1]])
			for c,p,alleles,genos in vcf_haploid_iterator(inVcf,
				sample_list=[sample] if sample else None,
				interval=interval))
		self.cur.executemany("insert into cons (chr,pos,allele) values (?,?,?)", rows)
		self.conn.commit()