def _makedb(self): """Internal method. Do not use""" self.logger.info('Creating SNP database from ...') self.logger.info('Input file: %s' % self.inname) if not os.path.exists(self.inname): self.logger.error('%s: No such file' % self.inname) self.logger.error('Database not created') sys.exit(1) self.load(db=self.outname) self.conn.text_factory = str self._vcf = VCFParser(self.inname) self._infokeys, self._schema = self._createschema() for s in self._schema: self.createtable(s, True) stmt = 'insert into snps values (' stmt += ','.join('?' * (5 + len(self._infonum))) + ')' self.curs = self.conn.cursor() cache = [] n = 0 next = self._vcf.__next__ while 1: try: cache.append(self._insertentry(next())) except StopIteration: break n += 1 if not n % 100000: self.curs.executemany(stmt, cache) self.logger.info('Processed %d entries' % n) cache = [] if cache: self.curs.executemany(stmt, cache) self.logger.info('Processed %s entries' % n) # Add version details fd = dt.fromtimestamp(os.path.getmtime( self.inname)).strftime('%Y-%m-%d') if 'dbsnp' in self.inname: version = 'v' + str(self._vcf.meta['dbSNP_BUILD_ID']) elif 'ESP6500' in self.inname: fn = os.path.splitext(os.path.split(self.inname)[1])[0] version = fn.split('.')[0] elif 'ExAC' in self.inname: fn = os.path.splitext(os.path.split(self.inname)[1])[0] version = fn.split('.')[1] else: version = "v%s_%s" % tuple(fd.split('-')[:2]) self.set_version(os.path.split(self.outname)[1], fd, version, n) self.conn.commit() self.curs.close() self.conn.close() self.logger.info('... SNP database created')
def get_varcnt(invcf):
    '''Count qualifying coding-exonic variants per (chromosome, gene)

    A record adds one to a (chrom, gene) key when the gene's prioritized
    transcript lies in a 'CodingExonic' region, its mutation is not
    'Syn' and the record's ESPAF value is below 5.0. Within a single
    record a key is counted at most once, even if several alternate
    alleles qualify.

    Args:
        invcf(str): VARANT annotated VCF

    Returns:
        varcnt(dictionary): Maps (chrom, gene) tuples to the number of
                            qualifying records
    '''
    counts = {}
    parser = VCFParser(invcf)
    for rec in parser:
        parser.parseinfo(rec)
        prioritized = vp.prio_trans(vp.parse(rec.info))
        # Keys already counted for this record
        counted = set()
        for altid, per_gene in prioritized.items():
            if altid == 'intergenic':
                continue
            for gene, geneinfo in per_gene.items():
                txant = geneinfo['TRANSCRIPT']
                key = (rec.chrom, gene)
                if 'CodingExonic' not in txant.region.split('_'):
                    continue
                if txant.mutation == 'Syn':
                    continue
                # TODO (to be replaced by ExAC?)
                if not (rec.info['ESPAF'] < 5.0):
                    continue
                if key in counted:
                    continue
                counted.add(key)
                counts[key] = counts.get(key, 0) + 1
    return counts
def _get_sample_ids(invcf):
    '''Return sample ids present in the vcf file

    Args:
        invcf(str): Path of the VCF file handed to VCFParser

    Returns:
        The ``samples`` attribute of the parser opened on ``invcf``
    '''
    vcfo = VCFParser(invcf)
    try:
        return vcfo.samples
    finally:
        # close() used to sit after the return statement and therefore
        # never ran; the finally clause runs it before the function exits.
        vcfo.close()
# NOTE(review): the statements down to ``return geneannot`` are the tail of a
# function whose ``def`` line is outside this view; the nesting depth of the
# first statement is a best guess -- confirm against the full file.
    # Start with the single transcript annotation ``ta``.
    geneannot['TRANSCRIPT'] = [ta]
    if alltrans and \
            geneannot['TRANSCRIPT'][0].region != 'Intergenic':
        # Append every other transcript listed for ``prior_gene`` under any
        # key of ``parsed_dict``, skipping ones already collected.
        for altid, geneinfo in parsed_dict.items():
            if prior_gene in geneinfo:
                for nt in geneinfo[prior_gene]['TRANSCRIPTS']:
                    if nt not in geneannot['TRANSCRIPT']:
                        geneannot['TRANSCRIPT'].append(nt)
    return geneannot


# Command-line demo: for every record of the VCF named by the first argument,
# print the parsed annotation, the prioritized transcript per gene and the
# prioritized gene annotation.
# NOTE(review): the ``print`` statements below are Python 2 syntax and do not
# compile under Python 3.
if __name__ == '__main__':
    import sys
    vcffile = sys.argv[1]
    vcf = VCFParser(vcffile)
    for rec in vcf:
        print rec.chrom, rec.pos, rec.ref, rec.alt, rec.id
        # INFO must be parsed before rec.info is handed to parse()
        vcf.parseinfo(rec)
        print 'Parsed Varant Annotation - '
        par_ant = parse(rec.info)
        for ac, val in par_ant.items():
            print ac, '\t', val
        print 'Prioritized transcript per gene dictionary - '
        pt_par_ant = prio_trans(par_ant)
        for ac, val in pt_par_ant.items():
            print ac, '\t', val
        print 'Prioritized gene per vcf record - '
        ga = get_prior_geneannot(rec.info, alltrans=False)
        print ga
        print '\n'
class SNPDB(db.DB):
    """SNP database built from a VCF file.

    ``_makedb()`` fills a ``snps`` table with one row per VCF record:
    five fixed columns (chrom, pos, id, ref, alt) followed by one
    ``info_<KEY>`` column per INFO field declared in the VCF header.
    """

    def _getschema(self):
        """Return the base schema statements.

        The first statement is deliberately left open (it ends with a
        trailing comma); ``_createschema()`` appends the INFO columns and
        the closing parenthesis.
        """
        schema = [
            ("create table snps (chrom text not null, "
             "pos int not null, id text not null, "
             "ref text not null, alt text not null, "),
            """create index chromposindex on snps (chrom, pos)""",
            """create index snpid on snps (id)""",
        ]
        return schema

    def _makedb(self):
        """Internal method. Do not use.

        Build the SNP database ``self.outname`` from the VCF file
        ``self.inname``: create the tables, bulk-insert one row per VCF
        record and record the version details through ``set_version()``.

        The cursor and the connection are released even when parsing or
        inserting fails, so a failed build does not leave an open handle
        on the database.

        Raises:
            SystemExit: With status 1 when ``self.inname`` does not exist.
        """
        self.logger.info('Creating SNP database from ...')
        self.logger.info('Input file: %s' % self.inname)
        if not os.path.exists(self.inname):
            self.logger.error('%s: No such file' % self.inname)
            self.logger.error('Database not created')
            sys.exit(1)

        self.load(db=self.outname)
        cursor = None
        try:
            self.conn.text_factory = str
            self._vcf = VCFParser(self.inname)
            self._infokeys, self._schema = self._createschema()
            for s in self._schema:
                self.createtable(s, True)

            # Five fixed columns plus one placeholder per INFO key
            stmt = 'insert into snps values ('
            stmt += ','.join('?' * (5 + len(self._infonum))) + ')'
            cursor = self.curs = self.conn.cursor()

            cache = []
            n = 0
            # Iterate the parser directly instead of rebinding the builtin
            # ``next``; rows are flushed in batches of 100000.
            for rec in self._vcf:
                cache.append(self._insertentry(rec))
                n += 1
                if not n % 100000:
                    cursor.executemany(stmt, cache)
                    self.logger.info('Processed %d entries' % n)
                    cache = []
            if cache:
                cursor.executemany(stmt, cache)
            self.logger.info('Processed %s entries' % n)

            # Add version details
            fd = dt.fromtimestamp(os.path.getmtime(
                self.inname)).strftime('%Y-%m-%d')
            if 'dbsnp' in self.inname:
                version = 'v' + str(self._vcf.meta['dbSNP_BUILD_ID'])
            elif 'ESP6500' in self.inname:
                fn = os.path.splitext(os.path.split(self.inname)[1])[0]
                version = fn.split('.')[0]
            elif 'ExAC' in self.inname:
                fn = os.path.splitext(os.path.split(self.inname)[1])[0]
                version = fn.split('.')[1]
            else:
                # Fall back to the year and month of the file's mtime
                version = "v%s_%s" % tuple(fd.split('-')[:2])
            self.set_version(os.path.split(self.outname)[1], fd, version, n)
            self.conn.commit()
        finally:
            # Runs on success and on failure alike
            if cursor is not None:
                cursor.close()
            self.conn.close()
        self.logger.info('... SNP database created')

    def _createschema(self):
        """Internal method to create schema based on the INFO fields in
        the VCF file.

        Executes ``INFOTABLE``, inserts one row per INFO key into the
        ``info`` table and fills ``self._infonum`` (key -> True when the
        value is stored as comma-joined text).

        Returns:
            tuple: ``(keys, schema)`` - the sorted INFO keys and the
            completed list of schema statements.
        """
        schema = self._getschema()
        info = self._vcf.meta['INFO']
        keys = sorted(info.keys())
        infocols = []
        cursor = self.conn.cursor()
        cursor.execute(INFOTABLE)
        stmt = 'insert into info values (?,?,?,?)'
        self._infonum = {}
        for key in keys:
            knumb, ktype, kdesc = info[key]
            if knumb not in '01':
                # Any Number other than 0 or 1 is stored as joined text
                ktype = 'text'
                self._infonum[key] = True
            else:
                ktype = TYPEMAP[RTYPEMAP.get(ktype, 'text')]
                self._infonum[key] = False
            cursor.execute(stmt, ('info_' + key, ktype, knumb, kdesc))
            infocols.append('info_%s %s' % (key, ktype))
        self.conn.commit()
        cursor.close()
        if infocols:
            schema[0] += ','.join(infocols)
            schema[0] += ')'
        else:
            # No INFO fields declared: drop the dangling comma of the base
            # statement, otherwise "..., )" is not valid SQL.
            schema[0] = schema[0].rstrip().rstrip(',') + ')'
        self.logger.info(schema)
        return keys, schema

    def _insertentry(self, rec):
        """Convert a VCF record into the row inserted into ``snps``.

        The row holds chrom, pos, ';'-joined ids, ref and ','-joined alt
        alleles, followed by one value per key of ``self._infokeys``
        (None when the record lacks that INFO key).
        """
        self._vcf.parseinfo(rec)
        args = [
            rec.chrom, rec.pos, ';'.join(rec.id), rec.ref, ','.join(rec.alt)
        ]
        infn = self._infonum
        info = rec.info.get
        for key in self._infokeys:
            v = info(key, None)
            if v is not None:
                if infn[key]:
                    # Multi-valued field: flatten to comma-separated text
                    v = ','.join(str(el) for el in v)
            args.append(v)
        return args
def _load(invcf, thres_af, nmethod, data=None, sc=1):
    '''Collect, per sample and gene, the variants matching a search criteria

    Args:
        invcf(str): VCF file handed to VCFParser
        thres_af: Allele frequency threshold forwarded to isRare()
        nmethod: Forwarded unchanged to _is_Damaging(); only consulted
                 when sc == 2
        data(dictionary): Optional accumulator from an earlier call. Any
                          falsy value (None or an empty dict) is replaced
                          by a new dict.
        sc(int): Search criteria selector; only the values 1, 2 and 3
                 add entries

    Returns:
        data(dictionary): {sample id: {gene: [entry, ...]}} where every
                          entry is the tuple (ta, af, cadd, gt,
                          eqtl_flag, var, dn, cln_sig, lcr, sc_ant,
                          snps3d_pred). A key is created for every
                          sample of a record that passes _is_HQVar(),
                          even when none of its variants is kept.
    '''
    if not data:
        data = {}
    vcfs = VCFParser(invcf)
    samples = vcfs.samples
    for rec in vcfs:
        vcfs.parseinfo(rec)
        vcfs.parsegenotypes(rec)
        if not _is_HQVar(rec.filter):  # Checks variant is PASS
            continue
        for sid in samples:
            if sid not in data:
                data[sid] = {}
            gi = rec[sid]
            gt, gq = gi.GT, gi.GQ
            # Checks if genotype is not reference or GQ >= 30
            if not _genotype_check(gt, gq):
                continue
            # Second allele of a '/'-separated genotype string
            # NOTE(review): a '|'-separated (phased) genotype would fail
            # here unless _genotype_check rejects it -- confirm
            altid = int(gt.split('/')[1])
            # Variant label in the form chrom:pos:ref:alt
            var = rec.chrom + ':' + str(rec.pos) + ':' + rec.ref +\
                ':' + rec.alt[altid - 1]
            af, flag = isRare(altid, rec.info, thres_af)
            if not flag:  # Checks if variant is not Rare (AF < 5%) in ExAC
                continue
            if 'LCR' in rec.info:
                continue
            if 'CLNDBN' in rec.info:
                dn = rec.info.CLNDBN[altid - 1]
                sig_num = rec.info.CLNSIG[altid - 1]
                if '|' in sig_num:
                    # Several codes: keep the numerically largest one,
                    # ignoring '.' placeholders
                    sig_num = [int(e) for e in sig_num.split('|') if e != '.']
                    if sig_num:
                        sig_num.sort()
                        sig_num = sig_num[-1]
                        cln_sig = CLNSIG_MAP[sig_num]
                    else:
                        cln_sig, dn = '', ''
                elif sig_num != '.':
                    sig_num = int(sig_num)
                    cln_sig = CLNSIG_MAP[sig_num]
                else:
                    dn, cln_sig = '', ''
            else:
                dn, cln_sig = '', ''
            # Records carrying 'LCR' were skipped above, so this always
            # takes the else branch
            if 'LCR' in rec.info:
                lcr = 'LCR'
            else:
                lcr = ''
            if 'CADD_phred' in rec.info:
                val = rec.info['CADD_phred'][altid - 1]
                if val == '.':
                    cadd = ''
                else:
                    cadd = float(val)
            else:
                cadd = ''
            # dbscSNV scores are only looked up for single-base
            # substitutions (ref and alt both of length 1)
            if len(rec.ref) == len(rec.alt[altid - 1]) and len(rec.ref) == 1:
                ada_score, rf_score = get_dbscSNV_ant(rec.chrom, rec.pos,
                                                      rec.ref,
                                                      rec.alt[altid - 1])
                if (ada_score and ada_score > 0.6) or (rf_score and rf_score > 0.6):
                    scpred = 'Damaging'
                else:
                    scpred = ''
            else:
                ada_score, rf_score, scpred = '', '', ''
            sc_ant = [scpred, ada_score, rf_score]
            #Parse annotation and prioritize transcript
            pa = vp.prio_trans(vp.parse(rec.info))
            # Ignore the intergenic variants
            if altid not in pa:
                continue
            # Never set to True in this function
            eqtl_flag = False
            for gene, ant in pa[altid].items():
                ta = ant['TRANSCRIPT']
                # NOTE(review): ``key`` is not read anywhere below
                key = ta.trans_id + '_' + ta.aa
                # NOTE(review): handed to _is_Damaging() when sc == 2 and
                # stored afterwards; presumably filled in place -- confirm
                snps3d_pred = ['', '', '', '']
                # SC-1 variant present in Clinvar as Pathogenic or Likely Pathogenic
                if sc == 1:  # Search Criteria 1
                    if (cln_sig in ['Pathogenic', 'Likely pathogenic']):
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))
                # SC-2 variant is protein altering + SC-1
                if sc == 2:  # Search Criteria 2
                    if (_is_PASnv(ta) and _is_Damaging(altid, rec.info, ta,
                                                       snps3d_pred, nmethod)) or _is_NonSense(ta) \
                            or _is_Splicing(ta) or _is_PAIndel(ta) or \
                            scpred == 'Damaging' or (cln_sig in ['Pathogenic', 'Likely pathogenic']):
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))
                # SC-5 in intronic and UTR variants + SC-1 + SC-2
                # NOTE(review): the label above says SC-5 but the branch
                # is selected by sc == 3; unlike sc == 2 it does not call
                # _is_Damaging()
                if sc == 3:  # Search Criteria 3
                    if _is_Intronic(ta) or _is_UTR(ta) or _is_PASnv(ta) or \
                            _is_NonSense(ta) or _is_Splicing(ta) or _is_PAIndel(ta) \
                            or scpred == 'Damaging' or cln_sig in ['Pathogenic', 'Likely pathogenic']:
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))
    return data
def _get_sample_ids(invcf):
    '''Return the sample ids present in the vcf file

    Args:
        invcf(str): Path of the VCF file handed to VCFParser

    Returns:
        The ``samples`` attribute of the parser opened on ``invcf``
    '''
    vcfo = VCFParser(invcf)
    try:
        return vcfo.samples
    finally:
        # close() used to sit after the return statement and therefore
        # never ran; the finally clause runs it before the function exits.
        vcfo.close()