def get_cadd(mut, path='/u/sshuai/sshuai/func_score/cadd/v1.3'):
    ''' Get CADD scores with tabix '''
    # make chrom string
    mut['chrom'] = mut.chrom.astype(str)
    # create return table
    # SNPs and simple indels are supported. TODO: add MNP support
    keep = mut['type'].isin(['SNP', 'DEL', 'INS'])
    cadd = mut[keep].copy()
    if cadd.shape[0] == 0:
        logger.warning('No mutations left in CADD adjustment')
        return None
    logger.info('Retrieving CADD SNP Scores')
    # file names for version 1.3:
    # a single file for SNPs, plus pre-computed PCAWG indel scores
    snp = 'whole_genome_SNVs.tsv.gz'
    indel = 'PCAWG.INDELS.CADD.v1.3.tsv.gz'
    snp_path = os.path.join(path, snp)
    indel_path = os.path.join(path, indel)
    assert os.path.isfile(
        snp_path), 'Cannot find CADD SNP scores in {}'.format(snp_path)
    assert os.path.isfile(
        indel_path), 'Cannot find CADD PCAWG indel scores in {}'.format(
            indel_path)
    # open both CADD tables
    tb_snp = tabix.open(snp_path)
    tb_indel = tabix.open(indel_path)
    # row apply; columns by position are chrom, start, end, type, ref, alt
    func = lambda x: query_cadd(x[3], tb_snp, tb_indel, x[0], x[1], x[2], x[4], x[5])
    cadd['fscore'] = cadd.apply(func, axis=1)
    return cadd
def get_variants_by_tabix(sample_vcf, contig=None, start=None, end=None, query_str=None, reference_vcf=None):
    """
    :param sample_vcf: str or pytabix handler;
    :param contig: str;
    :param start: int;
    :param end: int;
    :param query_str: str; 'chrom:start-end' region string
    :param reference_vcf: str or pytabix handler;
    :return: list; list of dict
    """
    if isinstance(sample_vcf, str):  # Open sample VCF
        sample_vcf = tabix.open(sample_vcf)

    if query_str:
        records = sample_vcf.querys(query_str)
    else:
        records = sample_vcf.query(contig, start, end)

    # tabix iterators are single-pass, so materialize the records before
    # checking emptiness; the original len(list(records)) check exhausted
    # the iterator and discarded the sample's records
    records = list(records)

    if reference_vcf and len(records) == 0:  # If sample does not have the record, query reference if given
        if isinstance(reference_vcf, str):  # Open reference VCF
            reference_vcf = tabix.open(reference_vcf)

        records = reference_vcf.query(contig, start - 1, end)

    return [parse_variant(r) for r in records]
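# Usage sketch for get_variants_by_tabix above; the VCF path and coordinates
# are hypothetical placeholders for any bgzipped, tabix-indexed VCF. pytabix
# accepts either (contig, start, end) via query() or a single
# "chrom:start-end" region string via querys().
def example_get_variants():
    sample = 'sample.vcf.gz'  # assumed bgzipped + .tbi-indexed
    by_coords = get_variants_by_tabix(sample, contig='1', start=752720, end=752721)
    by_region = get_variants_by_tabix(sample, query_str='1:752720-752721')
    return by_coords, by_region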
def __init__(self, input_path, blacklist_regions=None, bases_order=None): """ Constructs a `Genome` object. """ self.genome = pyfaidx.Fasta(input_path) self.chrs = sorted(self.genome.keys()) self.len_chrs = self._get_len_chrs() self._blacklist_tabix = None if blacklist_regions == "hg19": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz")) elif blacklist_regions == "hg38": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg38.blacklist.bed.gz")) elif blacklist_regions is not None: # user-specified file self._blacklist_tabix = tabix.open(blacklist_regions) if bases_order is not None: bases = [str.upper(b) for b in bases_order] self.BASES_ARR = bases lc_bases = [str.lower(b) for b in bases] self.BASE_TO_INDEX = { **{b: ix for (ix, b) in enumerate(bases)}, **{b: ix for (ix, b) in enumerate(lc_bases)} } self.INDEX_TO_BASE = {ix: b for (ix, b) in enumerate(bases)} self.update_bases_order(bases)
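# Construction sketch for the Genome class above; "hg38.fa" is a placeholder
# for any faidx-indexable FASTA. blacklist_regions takes "hg19", "hg38", or a
# path to a user-supplied tabixed BED, and bases_order remaps the channel
# order used for sequence encoding.
def example_genome():
    return Genome('hg38.fa',
                  blacklist_regions='hg38',
                  bases_order=['A', 'C', 'G', 'T'])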
def init_db( self, gene_file="/datd/huboqiang/test_hESC/database/refGene.up2000_down2000.promoter.Bsorted.longestTid.bed", motifBed_file="/data/Analysis/huboqiang/software/encode-motifs-v1.3/matches.txt.gz" ): """ reference file used. """ self.file_geneTSS_tb = tabix.open(gene_file) self.file_motifBed_tb = tabix.open(motifBed_file)
def _unpicklable_init(self): if not self.initialized: self.genome = pyfaidx.Fasta(self.input_path) self.chrs = sorted(self.genome.keys()) self.len_chrs = self._get_len_chrs() self._blacklist_tabix = None if self.blacklist_regions == "hg19": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz")) elif self.blacklist_regions == "hg38": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg38.blacklist.bed.gz")) elif self.blacklist_regions is not None: # user-specified file self._blacklist_tabix = tabix.open(self.blacklist_regions) self.lens = np.array([self.len_chrs[c] for c in self.chrs]) self.inds = { c: ind for c, ind in zip( self.chrs, np.concatenate([[0], np.cumsum(self.lens)])) } if self.memmapfile is not None and os.path.isfile(self.memmapfile): # load memmap file self.sequence_data = np.memmap(self.memmapfile, dtype="float32", mode="r") self.sequence_data = np.reshape( self.sequence_data, (4, int(self.sequence_data.shape[0] / 4))) else: # convert all sequences into encoding self.sequence_data = np.zeros((4, self.lens.sum()), dtype=np.float32) for c in self.chrs: sequence = self.genome[c][:].seq encoding = self.sequence_to_encoding(sequence) self.sequence_data[:, self.inds[c]:self.inds[c] + self.len_chrs[c]] = encoding.T if self.memmapfile is not None: # create memmap file mmap = np.memmap(self.memmapfile, dtype="float32", mode="w+", shape=self.sequence_data.shape) mmap[:] = self.sequence_data self.sequence_data = np.memmap( self.memmapfile, dtype="float32", mode="r", shape=self.sequence_data.shape) self.initialized = True
def getJSONObject(params, positives, negatives, tbCladeSNPsFile, tbSNPcladesFile, snpPanelConfigFile): tbSNPclades = tabix.open(tbSNPcladesFile) tbCladeSNPs = tabix.open(tbCladeSNPsFile) uniqPositives = getUniqueSNPsetTabix(positives, tbSNPclades) uniqNegatives = getUniqueSNPsetTabix(negatives, tbSNPclades) conflicting = uniqPositives.intersection(uniqNegatives) uniqPositives = uniqPositives.difference(conflicting) uniqNegatives = uniqNegatives.difference(conflicting) if len(uniqPositives) == 0: return {"error": "unable to determine clade due to no positive SNPs"} warning = None if len(conflicting) > 0: warning = "conflicting calls for same SNP with names " + ", ".join( list(conflicting)) (ranked, hierarchy) = getRankedSolutionsScratch(uniqPositives, uniqNegatives, tbCladeSNPs, tbSNPclades) if "all" in params: result = [] for r in ranked: clade = r[1] score = r[4] result.append( decorateJSONObject(params, clade, score, uniqPositives, uniqNegatives, tbCladeSNPs, tbSNPclades, hierarchy, snpPanelConfigFile, conflicting, warning)) return result else: if len(ranked) > 0: clade = ranked[0][1] score = ranked[0][4] decorated = decorateJSONObject(params, clade, score, uniqPositives, uniqNegatives, tbCladeSNPs, tbSNPclades, hierarchy, snpPanelConfigFile, conflicting, warning) if len(ranked) > 1 and "score" in params: clade = ranked[1][1] score = ranked[1][4] decorated["nextPrediction"] = {"clade": clade, "score": score} return decorated else: if len(positives) == 1: return { "error": "unable to find " + list(positives)[0] + " on the YFull tree" } else: return { "error": "unable to find any of " + ", ".join(positives) + " on the YFull tree" }
def init_resource(self):
    """ init features and other annotation resources """
    import tabix
    if self.config.has_option(self.rv, 'dbsnp'):
        self.resources['dbsnp'] = tabix.open(self.config.get(self.rv, 'dbsnp'))

    self.features = []
    for rname in self.config.options(self.rv):
        featdb = self.config.get(self.rv, rname)
        if featdb.endswith('.featuredb'):
            self.features.append((rname, tabix.open(featdb)))
def get_eigen(mut, path='/u/sshuai/sshuai/func_score/eigen/v1.1', coding=True):
    ''' Get eigen scores with tabix '''
    # make chrom string
    mut['chrom'] = mut.chrom.astype(str)
    # eigen only supports chr 1-22
    valid_chrom = [str(i) for i in range(1, 23)]
    mut = mut[mut.chrom.isin(valid_chrom)]
    # create return table
    # SNP only. TODO: add MNP and INDEL support
    keep = mut['type'] == 'SNP'  # chrom != X, Y
    eigen = mut[keep].copy()
    if eigen.shape[0] == 0:
        logger.warning('No mutations left in Eigen adjustment')
        return None
    if coding:
        logger.info('Retrieving Eigen Coding Scores')
        # name for version 1.1. A single file for coding.
        name = 'Eigen_hg19_coding_annot_04092016.tab.bgz'
        file_path = os.path.join(path, name)
        assert os.path.isfile(
            file_path), 'Cannot find eigen coding in {}'.format(file_path)
        # open one eigen
        tb = tabix.open(file_path)
        # row apply
        func = lambda x: query_eigen_SNP(tb, x[0], x[1], x[2], x[4], x[5])
        eigen['fscore'] = eigen.apply(func, axis=1)
    else:
        logger.info('Retrieving Eigen Non-Coding Scores')
        # One file per chrom for non-coding. 22 in total.
        file_dict = {
            str(i): os.path.join(path, 'Eigen_hg19_noncoding_annot_chr{}.tab.bgz'.format(i))
            for i in range(1, 23)
        }
        # check files, must be 22 True
        file_names = np.array(list(file_dict.values()))
        check_file = np.array([os.path.isfile(f) for f in file_names])
        assert np.sum(check_file) == 22, 'Cannot find eigen noncoding in {}'.format(
            ", ".join(file_names[~check_file]))
        # open 22 eigen files
        file_dict = {k: tabix.open(v) for k, v in file_dict.items()}
        # row apply
        func = lambda x: query_eigen_SNP(file_dict[str(x[0])], x[0], x[1], x[2], x[4], x[5])
        eigen['fscore'] = eigen.apply(func, axis=1)
    return eigen
def init_resource(self):
    if self.config.has_option(self.rv, 'dbsnp'):
        import tabix
        self.resources['dbsnp'] = tabix.open(
            self.config.get(self.rv, 'dbsnp'))
def get_job_results(job_id, job=None):
    filters = request.args.to_dict()
    epacts_filename = job.relative_path("output.epacts.gz")
    with gzip.open(epacts_filename, "rt") as f:
        header = f.readline().rstrip('\n').split('\t')
        if header[1] == "BEG":
            header[1] = "BEGIN"
        if header[0] == "#CHROM":
            header[0] = "CHROM"
        assert len(header) > 0
    headerpos = {x: i for i, x in enumerate(header)}

    has_header_row = False
    if filters.get("region", ""):
        # the region filter is assumed to be formatted as "chrom:start-end"
        chrom, _, positions = filters["region"].partition(":")
        start_pos, _, end_pos = positions.partition("-")
        tb = tabix.open(epacts_filename)
        indata = tb.query(chrom, int(start_pos), int(end_pos))
    else:
        indata = (x.rstrip('\n').split("\t")
                  for x in gzip.open(epacts_filename, "rt"))
        has_header_row = True

    pass_tests = []
    if filters.get("non-monomorphic", False):
        if "AC" not in headerpos:
            raise Exception("Column AC not found")
        ac_index = headerpos["AC"]
        def mono_pass(row):
            return float(row[ac_index]) > 0
        pass_tests.append(mono_pass)

    if "max-pvalue" in filters:
        if "PVALUE" not in headerpos:
            raise Exception("Column PVALUE not found")
        pval_index = headerpos["PVALUE"]
        thresh = float(filters.get("max-pvalue", 1))
        def pval_pass(row):
            if row[pval_index] == "NA":
                return False
            return float(row[pval_index]) < thresh
        pass_tests.append(pval_pass)

    def pass_row(row):
        return all(f(row) for f in pass_tests)

    def generate():
        yield "\t".join(header) + "\n"
        if has_header_row:
            next(indata)  # skip header; tabix queries return records only
        for row in indata:
            if pass_row(row):
                yield "\t".join(row) + "\n"

    return Response(generate(), mimetype="text/plain")
def getLeadSNPs(chrom, snps, IndSigSNPs, params): leadSNPs = [] checked = [] IndSigSNPs = IndSigSNPs[IndSigSNPs[:, 4].astype(float).argsort()] for snp in IndSigSNPs: if snp[1] in checked: continue ldfile = params.refgenome_dir + '/' + params.refpanel + '/' + params.pop + '/' + params.pop + '.chr' + str( snp[2]) + '.ld.gz' tb = tabix.open(ldfile) ld_tmp = tb.querys(snp[2] + ":" + snp[3] + "-" + snp[3]) inSNPs = [] inSNPs.append(snp[1]) for l in ld_tmp: if float(l[6]) < params.r2_2: continue if int(l[1]) != int(snp[3]): continue if int(l[4]) in IndSigSNPs[:, 3].astype(int): rsID = IndSigSNPs[IndSigSNPs[:, 3].astype(int) == int(l[4]), 1][0] checked.append(rsID) inSNPs.append(rsID) leadSNPs.append([ snp[0], snp[1], snp[2], snp[3], snp[4], str(len(inSNPs)), ";".join(inSNPs) ]) leadSNPs = np.array(leadSNPs) leadSNPs = leadSNPs[leadSNPs[:, 3].astype(int).argsort()] return leadSNPs
def getChr15(filedir, snps, Chr15, Chr15cells, chr15dir):
    if int(Chr15) == 1:
        annot = pd.read_table(filedir + "annot.txt", sep="\t")
        annothead = list(annot.columns.values)
        annot = annot.values
        annot = annot[ArrayIn(annot[:, 0], snps[:, 0])]
        if Chr15cells[0] == "all":
            Chr15cells = list(annothead[3:len(annothead)])
        for c in Chr15cells:
            snps = np.c_[snps, annot[:, annothead.index(c)]]
        Chr15data = []
        chrom = int(snps[0, 1])
        start = min(snps[:, 2])
        end = max(snps[:, 2])
        if end - start == 0:
            end += 500
            start -= 500
        for i in Chr15cells:
            tb = tabix.open(chr15dir + "/" + str(i) + "_core15.bed.gz")
            tmp = tb.querys(str(chrom) + ":" + str(start) + "-" + str(end))
            for l in tmp:
                # clip feature boundaries to the queried window
                if int(l[1]) < start:
                    l[1] = str(start)
                if int(l[2]) > end:
                    l[2] = str(end)
                Chr15data.append([i, int(l[1]), int(l[2]), int(l[3])])
        return [snps, Chr15data]
    else:
        return [snps, []]
def compute_1000genomes_prs():
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/"
    url += "ALL.2of4intersection.20100804.genotypes.vcf.gz"
    tb = tabix.open(url)
    records = tb.query("1", 752720, 752721)
    for record in records:
        print(record)
def compute_prs(raw_genotype_file, variants):
    counter = 0
    prs_score = 0.0
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/"
    url += "ALL.2of4intersection.20100804.genotypes.vcf.gz"
    tb = tabix.open(url)
    bed_file = open(raw_genotype_file.split(".")[0] + ".bed", "w")
    with open(raw_genotype_file, "r") as genotypes:
        for line in genotypes:
            if line[0] == "#":
                continue
            columns = line.split("\t")
            bed_file.write(columns[1] + "\t" + str(int(columns[2]) - 1) +
                           "\t" + str(int(columns[2])) + "\n")
            try:
                genotype = columns[3][0] + ":" + columns[3][1]
                variant = columns[1] + ":" + columns[2] + ":" + genotype
                score = variants[variant]
                prs_score += float(score)
            except (KeyError, IndexError, ValueError):
                # variant not scored, or genotype column malformed
                continue
            counter += 1
            print(counter)
    bed_file.close()
    return prs_score
def checkAndOpen(db):
    if 'tabix' not in sys.modules:
        return None
    db = os.path.expanduser(db)
    if not os.path.exists(db):
        return None
    # 'source' is a variable used to title the column in the output
    # it is defined by the user in the configuration script step when generating the JSON file
    if os.path.splitext(db)[1] == ".gz" and os.path.exists(db + ".tbi"):
        try:
            database = gzip.open(db)
        except IOError:
            print("WARNING: could not open {}".format(db))
            return None
    elif os.path.splitext(db)[1] == ".vcf":
        abortWithMessage("Error: database file {0} must be compressed with bgzip".format(db))
    elif os.path.splitext(db)[1] == ".gz" and not os.path.exists(db + ".tbi"):
        abortWithMessage("Compressed database is not tabix indexed")
    else:
        abortWithMessage("Error opening database files: {0}".format(db))
    # readline() returns an empty string at EOF rather than raising
    # StopIteration, so test for an empty first line explicitly
    row = database.readline()
    database.close()
    if not row:
        print("Empty file {}".format(db))
        return None
    tb = tabix.open(db)
    return tb
def getNonCandidateSNPs(filedir, snps, min_pos, max_pos):
    chrom = int(snps[0, 1])
    chrcol = 0
    poscol = 1
    tb = tabix.open(filedir + "all.txt.gz")
    # clamp the window start at 0 so tabix does not reject a negative coordinate
    tb_snps = tb.querys(
        str(chrom) + ":" + str(max(0, min_pos - 500000)) + "-" + str(max_pos + 500000))
    tmp = []
    for l in tb_snps:
        tmp.append([int(l[0]), int(l[1]), float(l[2])])
    tmp = np.array(tmp)
    tmp = tmp[ArrayNotIn(tmp[:, poscol], snps[:, 3])]
    ### filter SNPs if there are too many #####
    if len(tmp) > 10000:
        tmp_keep = tmp[tmp[:, 2] < 0.05]
        tmp = tmp[tmp[:, 2] >= 0.05]
        step = int(len(tmp) / (10000 - len(tmp_keep))) + 1
        tmp = tmp[np.arange(0, len(tmp), step)]
        tmp = np.r_[tmp, tmp_keep]
    out = []
    for l in tmp:
        out.append([int(l[0]), int(l[1]), l[2]])
    return out
def read_vcf(genotype_files, chrom, start, end): x = [] for record in tabix.open(genotype_files[chrom - 1]).query( str(chrom), start, end): dose = [_.split(':')[2] if _ != '.' else -1 for _ in record[9:]] x.append([float(_) if _ != '.' else -1 for _ in dose]) return np.array(x)
def load_exclude(file_list): exclude_list = [] if file_list is None: return exclude_list for filename in file_list: exclude_list.append(tabix.open(filename)) return exclude_list
def LoadGenotypes(gtfile, gtind, regdata):
    chrom = regdata["chrom"].values[0]
    start = min(regdata["str.start"])
    end = max(regdata["str.start"])
    positions = list(regdata["str.start"])
    loaded_positions = []
    tb = tabix.open(gtfile)
    records = tb.query(chrom, start - 1, end + 1)
    data = []
    for record in records:
        pos = int(record[1])
        if pos not in positions:
            continue
        loaded_positions.append(pos)
        data.append([GetFloat(record[i + 2]) for i in gtind])  # first two cols are chrom, start
    regdata = regdata[regdata["str.start"].apply(
        lambda x: x in loaded_positions)]
    # assert on all() rather than on a list literal, which is always truthy
    assert all(
        regdata["str.start"].values[i] == loaded_positions[i]
        for i in range(len(loaded_positions)))
    return data, regdata
def get_repeats(region): chrom = region.split(":")[0] self = '/uufs/chpc.utah.edu/common/home/u1021864/analysis/exacresiduals/data/self-chains.gt90.bed.gz' seg = '/uufs/chpc.utah.edu/common/home/u1021864/analysis/exacresiduals/data/hgsegmental.bed.gz' tb = tabix.open(self) rep = [] for r in tb.querys(region): s = int(r[1]) e = int(r[2]) rep.append((s, e)) tb = tabix.open(seg) for r in tb.querys(region): s = int(r[1]) e = int(r[2]) rep.append((s, e)) return rep
def pairwise_indel_finder_query():
    form = pairwise_indel_form()
    if form.validate_on_submit():
        data = form.data
        strain_cmp = [data["strain_1"], data["strain_2"]]
        tb = tabix.open(SV_BED_URL)
        query = tb.query(data["chromosome"], data["start"], data["stop"])
        results = []
        for row in query:
            row = dict(zip(SV_COLUMNS, row))
            row["START"] = int(row["START"])
            row["END"] = int(row["END"])
            if row["STRAIN"] in strain_cmp and \
                    MIN_SV_SIZE <= int(row["SIZE"]) <= MAX_SV_SIZE:
                row["site"] = f"{row['CHROM']}:{row['START']}-{row['END']} ({row['SVTYPE']})"
                results.append(row)
        # mark overlaps
        if results:
            results[0]['overlap'] = False
            first = results[0]
            for idx, row in enumerate(results[1:]):
                row["overlap"] = overlaps(first["START"], first["END"],
                                          row["START"], row["END"])
                if row["overlap"]:
                    results[idx]['overlap'] = True
                first = row
        # Filter overlaps
        results = [x for x in results if x['overlap'] is False]
        # sorted() returns a new list, so the result must be assigned
        results = sorted(results, key=lambda x: (x["START"], x["END"]))
        return jsonify(results=results)
    return jsonify({"errors": form.errors})
def parseVCF(vcfFile, tbPositionSNPsFile):
    tbPositionSNPs = tabix.open(tbPositionSNPsFile)
    positives = []
    negatives = []
    if isMale(vcfFile):
        vcf_reader = vcf.Reader(filename=vcfFile)
        record = next(vcf_reader)
        while record:
            if record.CHROM == "chrY":
                position = str(record.POS)
                basesString = record.samples[0].gt_bases
                if basesString:
                    allele = parseBases(basesString)
                    if allele:
                        (snp, call) = getPositionSNP(position, allele, tbPositionSNPs)
                        if snp:
                            if call == "+":
                                positives.append(snp)
                            else:
                                negatives.append(snp)
            try:
                record = next(vcf_reader)
            except StopIteration:
                record = None
    return positives, negatives
def retrive_score(mut, conf):
    ''' Obtain functional scores based on mut and conf '''
    conf = conf.sort_values('order')
    score = np.empty(shape=mut.shape[0])
    score[:] = np.NAN
    for ix_conf, conf_row in conf.iterrows():
        # logger.info('Retrieving {} - {} - chrom {}'.format(conf_row['name'], conf_row['type'], conf_row['chroms']))
        tb = tabix.open(conf_row['path'])
        for ix, var in mut.iterrows():
            if (var['type'] == conf_row['type'] == 'SNP'
                    or conf_row['type'] == 'ALL'
                    or (var['type'] in ['INS', 'DEL'] and conf_row['type'] == 'INDEL')):
                try:
                    query_res = tb.query(var.chrom, var.start, var.end)
                except TabixError:
                    query_res = []
                    # Known error: eigen coding has no chrom X, Y data.
                    if not (conf_row['name'] in ['EIGEN_CODING', 'EIGEN_NONCODING']
                            and var.chrom in ['X', 'Y']):
                        logger.warning('Retrieving {} - {} score error for {}:{}-{}'
                                       .format(conf_row['name'], conf_row['type'],
                                               var.chrom, var.start, var.end))
                if conf_row.ref_ix < 0:
                    # no ref alt info (e.g., LINSIGHT score)
                    score[ix] = np.mean(
                        [float(i[conf_row.score_ix]) for i in query_res])
                else:
                    for res in query_res:
                        if var.ref == res[conf_row.ref_ix] and var.alt == res[conf_row.alt_ix]:
                            score[ix] = float(res[conf_row.score_ix])
    return score
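# Sketch of the configuration table retrive_score() expects, inferred from the
# column accesses above (order, name, type, path, ref_ix, alt_ix, score_ix);
# the paths and indices are placeholders. A negative ref_ix marks tracks such
# as LINSIGHT that carry no ref/alt columns, in which case overlapping scores
# are averaged.
import pandas as pd

example_conf = pd.DataFrame([
    {'order': 0, 'name': 'CADD', 'type': 'SNP',
     'path': 'whole_genome_SNVs.tsv.gz', 'ref_ix': 2, 'alt_ix': 3, 'score_ix': 5},
    {'order': 1, 'name': 'LINSIGHT', 'type': 'ALL',
     'path': 'LINSIGHT.bed.gz', 'ref_ix': -1, 'alt_ix': -1, 'score_ix': 3},
])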
def process_identical_query(query_obj):
    """
    Loops through the output of a 'identical' query and processes all entries.
    The function used to update the entries is defined before the loop
    to reduce the if-clauses evaluated in every iteration
    (slight performance increase)
    """
    # Execute the tabix query, skip query if empty result
    try:
        tb = tabix.open(query_obj.originalFile)
        records = tb.query(query_obj.oldSeqID, 0, query_obj.oldSeqLength)
    except tabix.TabixError:
        return
    # get frequently used attributes for faster lookup
    newID = query_obj.newSeqID
    # the lock guards the shared output file while writing and is released
    # automatically when the block exits
    with lock:
        with open(query_obj.dependentFile, 'a') as updated_file:
            # Loop through tabix output
            for entry in records:
                # Modify the oldID with the newID
                entry[0] = newID
                # check if the updated coordinate is negative. if yes, the entry is discarded
                if int(entry[1]) < 0:
                    continue
                updated_file.write("\t".join(entry))
                updated_file.write("\n")
def getVariants(chrom, start_pos, window_size): directory = SNP_DIR # directory where .gz and .gz.tbi files are stored fn = chrom + FILE_END readFile = os.path.join(directory, fn) # make sure inputs are integers start_pos = int(start_pos) window_size = int(window_size) - 1 # open the tabix tb = tabix.open(readFile) # query for the position end_pos = start_pos + window_size print("grabbing variants from {} {} {}".format(chrom, str(start_pos), str(end_pos))) # grab the variant records that fall between start_pos and end_pos tb_records = tb.query(chrom, start_pos, end_pos) # store tabix data into list records = [] for record in tb_records: records.append(record) return records
def intersect_region(region_file, label, dbi_file, output_folder):
    '''
    intersect dbi with region file (bed format) and write final file in the output folder
    '''
    fin = open(region_file, 'r')
    out_file = os.path.join(output_folder, label + '.bed')
    fout = open(out_file, 'w')
    try:
        dbi = tabix.open(dbi_file)
    except tabix.TabixError:
        print("Can't load tabix file %s" % (dbi_file), file=sys.stderr)
        exit(1)
    for line in fin:
        if line.strip().startswith('#') or line.strip() == '':
            continue
        row = line.strip().split()
        chrom = row[0]
        start = int(row[1])
        stop = int(row[2])
        result = dbi.query(chrom, start, stop)
        for x in result:
            print('\t'.join(x), file=fout)
    fin.close()
    fout.close()
    create_tabix(out_file)
def gene_activity_matrix(fragments, features, barcodes):
    '''
    Computes the activity of each feature in scATAC-seq

    fragments : fragment file that is bgzipped (provided by cellranger)
    features: chr, start, end, gene
    barcodes: list of barcodes

    returns: gene activity matrix
    '''
    tb = tabix.open(fragments)
    gene_activity = np.zeros((len(barcodes), len(features)))
    # hashmap to correspond barcodes with index in matrix; values start at 1
    # so the `if z:` truthiness test below can detect missing barcodes
    barcode_lookup = dict(zip(barcodes, np.arange(1, 1 + len(barcodes))))
    for i in range(features.shape[0]):
        chrom, start, end = features.iloc[i, [0, 1, 2]]
        fragment_df = utils.read_tabix(tb, (chrom, start, end))
        if fragment_df.shape[0] > 0:
            curr_barcodes = fragment_df[3].values
            for b in curr_barcodes:
                z = barcode_lookup.get(b)
                if z:
                    gene_activity[z - 1, i] += 1
        # the last index is features.shape[0] - 1, so compare against that
        if i % 1000 == 0 or i == features.shape[0] - 1:
            percent_complete = str(
                np.round(100 * ((i + 1) / features.shape[0]), decimals=2))
            print('\r Progress: ' + percent_complete + '%', end="")
    gene_activity = pd.DataFrame(gene_activity,
                                 index=barcodes,
                                 columns=features['gene'].values)
    return gene_activity
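# Usage sketch for gene_activity_matrix; the fragment path and barcode are
# placeholders. features must be a DataFrame whose first three columns are
# chrom/start/end plus a 'gene' column, matching the .iloc and ['gene']
# accesses above.
def example_gene_activity():
    import pandas as pd
    features = pd.DataFrame({'chrom': ['chr1'], 'start': [10000],
                             'end': [20000], 'gene': ['GENE1']})
    barcodes = ['AAACCCAAGAAACACT-1']
    return gene_activity_matrix('fragments.tsv.gz', features, barcodes)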
def extract_CADD_score(arguments, q): vcf_record, caddfile = arguments tb = tabix.open(caddfile) chromosome = (vcf_record.CHROM).replace("chr","") vcf_record.INFO["RAWCADD"] = 0 vcf_record.INFO["PHREDCADD"] = 0 # Specific for CADD files # FIXME: get info about chr or not from provided VCF file records = tb.query(chromosome, vcf_record.POS-1, vcf_record.POS) # Look for matching mutation # Works for SNVs, InDels optimisation is ongoing for rec in records: if rec[3] == vcf_record.ALT[0]: # FIXME: Make requested fields optional through arguments vcf_record.INFO["RAWCADD"] = rec[4] vcf_record.INFO["PHREDCADD"] = rec[5] break # workaround since multiprocess can't handle VCF record class objects # FIXME: use VCF class records rather than this ugly string annotated = VCF_WRITER._map(str, [vcf_record.CHROM, vcf_record.POS, vcf_record.ID, vcf_record.REF]) + [VCF_WRITER._format_alt(vcf_record.ALT), str(vcf_record.QUAL) or '.', VCF_WRITER._format_filter(vcf_record.FILTER), VCF_WRITER._format_info(vcf_record.INFO)] # Return results to Queue q.put(annotated) return(annotated)
def get_cadd(config, chrom, start, ref, alt):
    ''' add cadd of variant '''
    tabix_fp = config['data_paths']['cadd']['whole_genome_cadd']
    try:
        tb = tabix.open(tabix_fp)
    except tabix.TabixError:
        logging.warning('{0} not available'.format(tabix_fp))
        return np.nan
    try:
        records = tb.querys(str(chrom) + ':' + str(start) + '-' + str(start))
    except tabix.TabixError:
        logging.warning('Error when trying to query {0}-{1}-{2}-{3}'.format(chrom, start, ref, alt))
        return np.nan
    for record in records:
        if record[2] != ref:
            logging.warning('Reference {0} is not the one in CADD for entry {1}-{2}-{3}-{4}'.format(ref, chrom, start, ref, alt))
            return np.nan
        if record[3] == alt:
            return float(record[5])
    logging.warning('I do not find a CADD entry for {0}-{1}-{2}-{3}'.format(chrom, start, ref, alt))
    return np.nan
def calc_all(all_genes, bases_to_exclude, rscu_fh, gerp_fp, genome_fa,
             syn_gerp_out, bed_out):
    """
    Calculates mean gerp score for all Gene objects contained in dict all_genes
    and writes values to the output handles
    :param all_genes: dict of Gene objects
    :param bases_to_exclude: bases to skip when computing synonymous GERP
    :param rscu_fh: handle to RSCU table read via read_rscu_f
    :param gerp_fp: path to tabix-indexed gerp file
    :param genome_fa: path to reference genome fasta that has been indexed via samtools faidx
    :param syn_gerp_out: handle for per-gene synonymous GERP output
    :param bed_out: handle for per-site BED output
    """
    with gzip.open(gerp_fp, 'rt') as gerp_f:
        gerp_header = gerp_f.readline()
        gerp_header = gerp_header.strip().split("\t")
    gerp_tb = tabix.open(gerp_fp)
    genome = pyfaidx.Fasta(genome_fa)
    rscu = read_rscu_f(rscu_fh)
    syn_gerp_out.write("#GENE\tSYN_GERP\n")
    bed_out.write("#CHROM\tPOS\tSTRAND\tGENE\tCDS_POS\tCODON\tRSCU\tGERP\n")
    for gene_obj in all_genes.values():
        gene_obj.calc_syn_gerp(genome, gerp_header, gerp_tb, rscu, bases_to_exclude)
        syn_gerp_out.write("{}\t{}\n".format(gene_obj.gene, gene_obj.syn_gerp))
        for line in gene_obj.bed:
            bed_out.write(line)
def query_fragments(fragment_file, chrom, start, end): """ Counts number of fragments per barcode in fragment file. Parameters ---------- fragment_file: path to fragment file chrom: chromosome to query start: start of query region end: end of query region Returns ------- records: fragments in given region. """ tb = tabix.open(fragment_file) results = tb.querys("%s:%d-%d" % (chrom, start, end)) records = [] for record in results: records.append(record) return records
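# Usage sketch: cellranger-atac fragment files are BED-like with
# (chrom, start, end, barcode, count) columns; the path is a placeholder.
def example_fragment_counts():
    import pandas as pd
    records = query_fragments('fragments.tsv.gz', 'chr1', 10000, 20000)
    df = pd.DataFrame(records, columns=['chrom', 'start', 'end', 'barcode', 'count'])
    return df['barcode'].value_counts()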
def do_peak_feat_row(map_args, def_param=(scores1, scores2)):
    """
    Loop definition for multithreading over table rows within add_peak_features.
    """
    (i, train, BED, opt) = map_args
    peaks = tabix.open(BED)
    row = train.iloc[i]
    anchor1, anchor2 = prepare_anchors(row, opt.extension)
    # narrowPeak column names and dtypes, shared by both anchors
    names = [
        "chrom", "chromStart", "chromEnd", "name", "score", "strand",
        "signalValue", "pValue", "qValue", "peak"
    ]
    dtypes = [
        "string", "int64", "int64", "string", "int64", "string",
        "float64", "float64", "float64", "int64"
    ]
    feats1 = get_features(anchor1.chrom, anchor1.start, anchor1.end, peaks, names, dtypes)
    feats2 = get_features(anchor2.chrom, anchor2.start, anchor2.end, peaks, names, dtypes)
    score1 = choose_feat(feats1, "signalValue", opt.collapse_peaks)
    score2 = choose_feat(feats2, "signalValue", opt.collapse_peaks)
    with lock:
        scores1[i] = (score1 + score2) / 2.0
        scores2[i] = np.std([score1, score2])
def get_exons(chrom, start, stop, file): tb = tabix.open(file) records = tb.query(chrom, start, stop) exons = [] for record in records: exons.append(record) return exons
def Query(tbk_file=None,seqname=None,start=1,end=2, idx=None,dtype=None,base_idx=8192): """ The main function for querying. tbk_file: input a single .tbk file. seqname: Chromosome (or the sequence name for tabix). start: start position. end: end position. base_idx: Number of index that should be skipped. """ if dtype is None: ver,dtype,num,idx1=Header(tbk_file) dtype=dtype_map_rev[dtype] if idx is None: idx=idx1 fmt=dtype_fmt[dtype] tb = tabix.open(idx) records=tb.query(seqname,start,end) lineNum=[record[3] for record in records] if len(lineNum)==0: return None elif len(lineNum)==1: return read_one_site(tbk_file,int(lineNum[0]),fmt,base_idx) n1,n2=lineNum[0],lineNum[-1] return read_multi_site(tbk_file,n1,n2,fmt,base_idx)
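# Usage sketch for Query(); the .tbk path and region are placeholders. When
# dtype/idx are omitted they are read from the .tbk header via Header(), and
# the tabix index is used only to map the region to line numbers in the
# value file.
def example_tbk_query():
    return Query(tbk_file='sample.tbk', seqname='chr1', start=10468, end=10470)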
def __init__(self, _snp, _ref, _vcf, _restrict, \ _num_ctrls, _window, _match_context): self.snp = _snp self.ref = pyfasta.Fasta(_ref) self.vcf = tabix.open(_vcf) if _restrict is not None: self.restrict = tabix.open(_restrict) else: self.restrict = None self.chromToKey = {} for k in self.ref.keys(): chrom = k.split()[0] self.chromToKey[chrom] = k self.num_ctrls = _num_ctrls self.window = _window self.match_context = _match_context if self.match_context >= 0: self.snp_context = self.GetContext(self.snp)
def main(args):
    chrom, coords = loadCoords(args.bedFile)
    tb = None
    if chrom:
        tb = tabix.open(
            "/home/evansj/me/data/ExAC/coverage/ftp.broadinstitute.org/pub/ExAC_release/current/coverage/Panel.chr%s.coverage.txt.gz"
            % (chrom,))
    with open(args.outFile, "w") as fout:
        if chrom:
            for st in coords:
                # st is 1-idx in coords
                # tabix needs 0-based
                records = tb.query(chrom, st - 1, st)
                for record in records:
                    thisChrom, pos, mean, median, c1, c5, c10, c15 = record[0:8]
                    print("\t".join((thisChrom, pos, c10)), file=fout)
def tabix_vcf(vcf_file, in_chr, in_start, in_stop):
    """A generator to get records in a VCF given a location."""
    chrom = str(in_chr)
    start = int(in_start)
    stop = int(in_stop)
    try:
        vcf_tb = tabix.open(vcf_file)
        for rec in vcf_tb.query(chrom, start, stop):
            yield rec
    except tabix.TabixError:
        # missing index or out-of-range contig: yield nothing
        return
def get_tabixhandle(path):
    """Check if a file is zipped and that the index exists

    If something looks weird raise a TabixError
    """
    if not path.endswith('.gz'):
        raise TabixError("File {0} does not end with '.gz'".format(path))
    index_file = path + '.tbi'
    if not os.path.isfile(index_file):
        raise TabixError("No index could be found for {0}".format(path))
    return tabix.open(path)
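# Usage sketch for get_tabixhandle; the path is a placeholder and TabixError
# is this module's own exception, raised before pytabix is touched when the
# .gz/.tbi pair is incomplete.
def example_open(path='variants.vcf.gz'):
    try:
        return get_tabixhandle(path)
    except TabixError as err:
        print(err)
        return None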
def __init__(self, args): self.args = args # parse out TransciptInfos print('Loading transcripts...', file=sys.stderr) self.tx_infos = self._parse_tx_infos(args.gencode_gtf) self.tx_info_by_id = dict([(info.transcript_id, info) for info in self.tx_infos]) # open tabix file print('Opening tabix file...', file=sys.stderr) self.tabix = tabix.open(args.gencode_gtf) # open BAM file and iterate over it print('Opening BAM file...', file=sys.stderr) self.sam_file = pysam.AlignmentFile(args.alignment_bam, 'r')
def ld_expand(df, ld_beds):
    """
    Expand a set of SNVs into all SNVs with LD >= 0.8 and return a BedTool of
    the expanded SNPs.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas dataframe with SNVs. The index is of the form chrom:pos where
        pos is the one-based position of the SNV. The columns are chrom,
        start, end. chrom, start, end make a zero-based bed file with the SNV
        coordinates.

    ld_beds : dict
        Dict whose keys are chromosomes and whose values are filenames of
        tabixed LD bed files. The LD bed files should be formatted like this:
            chr1    14463   14464   14464:51479:0.254183
        where the first three columns indicate the zero-based coordinates of a
        SNV and the fourth column has the one-based coordinate of that SNV,
        the one-based coordinate of another SNV on the same chromosome, and
        the LD between these SNVs (all separated by colons).

    Returns
    -------
    bt : pybedtools.BedTool
        BedTool with input SNVs and SNVs they are in LD with.
    """
    import pybedtools as pbt
    import tabix
    out_snps = []
    for chrom in ld_beds.keys():
        t = tabix.open(ld_beds[chrom])
        tdf = df[df['chrom'].astype(str) == chrom]
        for ind in tdf.index:
            p = tdf.loc[ind, 'end']
            out_snps.append('{}\t{}\t{}\t{}\n'.format(chrom, p - 1, p, ind))
            try:
                r = t.query('{}'.format(chrom), p - 1, p)
                for n in r:
                    p1, p2, r2 = n[-1].split(':')
                    if float(r2) >= 0.8:
                        out_snps.append('{}\t{}\t{}\t{}\n'.format(
                            n[0], int(p2) - 1, int(p2), ind))
            except tabix.TabixError:
                continue
    bt = pbt.BedTool(''.join(out_snps), from_string=True)
    bt = bt.sort()
    return bt
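# Sketch of parsing the LD bed name field documented in ld_expand's docstring
# ("14464:51479:0.254183" = one-based SNV position, partner position, r2).
def parse_ld_name(name):
    pos1, pos2, r2 = name.split(':')
    return int(pos1), int(pos2), float(r2)

# parse_ld_name('14464:51479:0.254183') -> (14464, 51479, 0.254183)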
def get_genotypes(CpG_location):
    import tabix
    import pandas as pd
    tb_file = "/path/to/file/DF_meth_variants.gz"
    df = pd.DataFrame(columns=range(0, 782))
    tb = tabix.open(tb_file)
    records = tb.querys(CpG_location)
    num = 0
    for record in records:
        df.loc[num] = record[3:]
        num += 1
    return df
def test_same_aa_different_positions(self): ''' check that same_aa() works correctly for different amino acids ''' lines = make_vcf_header() lines.append(make_vcf_line(pos=5, extra='Protein_position=2')) lines.append(make_vcf_line(pos=7, extra='Protein_position=3')) lines.append(make_vcf_line(pos=8, extra='Protein_position=4')) self.write_vcf(lines) vcf = tabix.open(self.path) pairs = [[('1', 7), ('1', 8)]] self.assertEqual(same_aa(vcf, pairs), [])
def test_same_aa(self): ''' check that same_aa() works correctly ''' # get the VCF lines lines = make_vcf_header() lines.append(make_vcf_line(pos=2, extra='Protein_position=1')) lines.append(make_vcf_line(pos=4, extra='Protein_position=1')) self.write_vcf(lines) vcf = tabix.open(self.path) pairs = [[('1', 2), ('1', 4)]] self.assertEqual(same_aa(vcf, pairs), [[('1', 2), ('1', 4)]])
def __search_pos_bdg(self):
    self.pd_frame = {}
    for i, bdg_file in enumerate(self.l_bdg_file):
        tb = tabix.open(bdg_file)
        record = tb.query(self.chrom, self.beg, self.end)
        l_pos = []
        l_cons = []
        l_xticks = [self.beg + 1]
        bin_size = int((self.end - self.beg) / 10)
        bin_size = 10**int(np.log10(bin_size))
        pre_pos = 0
        for rec in record:
            for pos in range(int(rec[1]), int(rec[2])):
                cons = float(rec[3])
                if pre_pos == 0:
                    pre_pos = int(rec[1])
                # Only consider the given region.
                if self.__is_intersect(pos):
                    # If bedGraph has gaps, use zero to fill them
                    if pos > pre_pos + 1:
                        for p in range(pre_pos + 1, pos):
                            l_pos.append(p)
                            l_cons.append(0.0)
                    l_pos.append(pos)
                    l_cons.append(cons)
                    if pos % bin_size == 0:
                        l_xticks.append(pos)
                    pre_pos = pos
        l_xticks.append(self.end)
        data = {'pos': l_pos, 'con': l_cons}
        self.pd_frame[bdg_file] = pd.DataFrame(data)
        if i == 0:
            self.l_xpos = l_xticks
            self.l_xticks = [str(tick) for tick in l_xticks]
def __init__(self, task_queue, results_queue, families={}, phased=False,
             vep=False, cadd_raw=False, cadd_file=None, cadd_1000g=None,
             cadd_exac=None, cadd_ESP=None, cadd_InDels=None, thousand_g=None,
             exac=None, dbNSFP=None, strict=False, verbosity=False):
    Process.__init__(self)
    self.task_queue = task_queue
    self.families = families
    self.results_queue = results_queue
    self.verbosity = verbosity
    self.phased = phased
    self.vep = vep
    self.cadd_raw = cadd_raw
    self.cadd_file = cadd_file
    self.cadd_1000g = cadd_1000g
    self.cadd_exac = cadd_exac
    self.cadd_ESP = cadd_ESP
    self.cadd_InDels = cadd_InDels
    self.thousand_g = thousand_g
    self.exac = exac
    self.dbNSFP = dbNSFP
    self.strict = strict
    self.any_cadd_info = False
    if self.cadd_file:
        self.cadd_file = tabix.open(self.cadd_file)
        self.any_cadd_info = True
    if self.cadd_1000g:
        self.cadd_1000g = tabix.open(self.cadd_1000g)
        self.any_cadd_info = True
    if self.cadd_exac:
        self.cadd_exac = tabix.open(self.cadd_exac)
        self.any_cadd_info = True
    if self.cadd_ESP:
        self.cadd_ESP = tabix.open(self.cadd_ESP)
        self.any_cadd_info = True
    if self.cadd_InDels:
        self.cadd_InDels = tabix.open(self.cadd_InDels)
        self.any_cadd_info = True
    if self.thousand_g:
        self.thousand_g = tabix.open(self.thousand_g)
    if self.exac:
        self.exac = tabix.open(self.exac)
    if self.dbNSFP:
        self.dbNSFP = tabix.open(self.dbNSFP)
def get_1mb_snps():
    import tabix
    tb = tabix.open('snps_all.gz')
    fname = 'newMethPosFile.txt_2-3_col_1'
    snps = {}
    with open(fname) as f:
        for line in f:
            a = line.rstrip('\n').rsplit('\t')
            # clamp at 1 so positions near a chromosome start do not go negative
            start = str(max(1, int(a[1]) - 1000000))
            stop = str(int(a[1]) + 1000000)
            pos = a[0] + ":" + start + "-" + stop
            records = tb.querys(pos)
            for record in records:
                snps[record[3]] = 0
    return snps
def test_same_aa_missing_protein_positions(self): ''' check that same_aa() works correctly when the vars aren't in the CDS ''' # if one of the variants in the pair does not have a protein position # listed (i.e. residue number), that indicates the variant could be # affecting the splice site, so we can't use the pair. lines = make_vcf_header() lines.append(make_vcf_line(pos=5)) lines.append(make_vcf_line(pos=7)) lines.append(make_vcf_line(pos=8, extra='Protein_position=4')) self.write_vcf(lines) vcf = tabix.open(self.path) pairs = [[('1', 7), ('1', 8)]] self.assertEqual(same_aa(vcf, pairs), [])
def test_screen_pairs_nonstandard_pair(self): ''' test that screen_pairs() works correctly ''' # get the VCF lines lines = make_vcf_header() lines.append(make_vcf_line(pos=2)) lines.append(make_vcf_line(pos=4)) lines.append(make_vcf_line(pos=5)) lines.append(make_vcf_line(pos=7)) lines.append(make_vcf_line(pos=8)) self.write_vcf(lines) vcf = tabix.open(self.path) # set up a list of 'pairs', where one 'pair' has three variants in it. # we exclude 'pairs' where n != 2. pairs = [[('1', 2), ('1', 4), ('1', 5)], [('1', 7), ('1', 8)]] self.assertEqual(screen_pairs(vcf, pairs, is_not_indel), [[('1', 7), ('1', 8)]])
def open_tabix_file(file_path): """docstring for open_tabix_file""" file_handle = tabix.open(file_path) try: file_handle.query('1', 1, 100) except tabix.TabixError as e: logger.warning("Something wrong with tabix file: {0}".format( file_path)) file_name, file_extension = os.path.splitext(file_path) if file_extension != '.gz': raise NotZippedError("File {0} does not seem to be bgzipped".format( file_path)) else: raise NotIndexedError("File {0} does not seem to be tabix"\ " indexed".format(file_path)) return file_handle
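# Usage sketch for open_tabix_file; the path is a placeholder. The probe
# query above ('1', 1, 100) forces pytabix to touch the index, so malformed
# inputs fail fast at open time rather than at first use.
def example_open_tabix(path='scores.tsv.gz'):
    try:
        return open_tabix_file(path)
    except (NotZippedError, NotIndexedError) as err:
        logger.error(err)
        return None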
def gerp(vf, af, name="gerp"):
    v = BedTool(vf)
    t = tabix.open(af)
    results = {}
    for var in v:
        try:
            result = 0.0
            num = 0
            for res in t.query(var.chrom, var.start, var.end):
                result += float(res[4])
                num += 1
            if num > 0:
                results[var.name] = result / num
        except (tabix.TabixError, ValueError, IndexError):
            # skip variants on contigs missing from the annotation
            # or with malformed score fields
            pass
    return Series(results, name=name)
def get_exac(config, chrom, start, stop, ref, alt):
    '''
    add exac annotation for the variant; in the exac file:
    # AN_Adj is the overall total of alleles (I do not know the difference as compared to AN)
    # AC_Adj is the overall number of mutant alleles observed across populations
    # AF = AC_Adj/AN_Adj
    '''
    tabix_fp = config['data_paths']['exac']['exac']
    tb = tabix.open(tabix_fp)
    if stop != start:
        logging.warning('The start {0} is different than stop {1}'.format(start, stop))
        return np.nan
    # A query returns an iterator over the results.
    records = tb.querys(str(chrom) + ':' + str(start) + '-' + str(stop))
    # if one single position is provided and this position exists, the iterator
    # contains a single record (a single list); if there are multiple alleles
    # at that position, the info is comma-separated
    for record in records:
        # no entry with pass filter
        if record[6] != 'PASS':
            return np.nan
        # the reference is not annotated well!
        if record[3] != ref:
            logging.warning('The ref {0} is not matching the info in\n {1}'.format(ref, record[0:5]))
        alt_s = record[4].split(',')
        try:
            # split the alt entry in case there are multiple alleles in the same record
            allele_pos = alt_s.index(alt)
            af_s = record[7].split('AF=')[1].split(';')[0].split(',')
            return float(af_s[allele_pos])
        except (ValueError, IndexError):
            # the alt is not in the record and thus is not in the exac data
            return np.nan
    # the "for record in records" loop is not entered if the tabix query is
    # empty (no entry for that position)
    return np.nan
def test_get_matches(self): ''' check that get_matches works correctly ''' # get the VCF lines lines = make_vcf_header() lines.append(make_vcf_line(pos=1)) lines.append(make_vcf_line(pos=2)) lines.append(make_vcf_line(pos=4)) lines.append(make_vcf_line(pos=5)) self.write_vcf(lines) vcf = tabix.open(self.path) pair = [('1', 2), ('1', 4)] # define the expected lines var1 = parse_vcf_line(make_vcf_line(pos=2).split('\t'), self.Variant) var2 = parse_vcf_line(make_vcf_line(pos=4).split('\t'), self.Variant) self.assertEqual(list(get_matches(vcf, pair)), [var1, var2])
def make_refGene_track(self, bed_tabix="/datd/huboqiang/ChIP_human/Week12/Database/refGene.sort.bed.gz"):
    '''
    cut -f 2- refGene.txt | awk '{OFS="\t";print $1"__"$12,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$13,$14,$15}' | /data/Analysis/huboqiang/software/UCSC/genePredToBed /dev/stdin /dev/stdout | bedtools sort -i /dev/stdin >refGene.sort.bed
    bgzip -fc refGene.sort.bed >refGene.sort.bed.gz
    tabix -p bed -s 1 -b 2 -e 3 refGene.sort.bed.gz
    '''
    tb = tabix.open(bed_tabix)
    record = tb.query(self.chrom, self.beg, self.end)
    self.geneGraph = {}
    for rec in record:
        tran, gene = rec[3].split("__")
        if gene not in self.geneGraph:
            self.geneGraph[gene] = {}
        if tran not in self.geneGraph[gene]:
            self.geneGraph[gene][tran] = {
                'beg': int(rec[1]), 'end': int(rec[2]),
                'exon_beg': [], 'exon_ext': [], 'cds_beg': [], 'cds_ext': [],
                'strand': rec[5]
            }
        l_beg = [int(beg) + int(rec[1]) for beg in rec[11].split(",")[:-1]]
        l_ext = [int(ext) for ext in rec[10].split(",")[:-1]]
        self.geneGraph[gene][tran]['exon_beg'] = l_beg
        self.geneGraph[gene][tran]['exon_ext'] = l_ext
        cds_beg = int(rec[6])
        cds_end = int(rec[7])
        exon_cnt = int(rec[9])
        if cds_beg == cds_end:
            continue
        for i in range(0, exon_cnt):
            beg = l_beg[i]
            end = l_beg[i] + l_ext[i]
            if cds_end < beg or cds_beg > end:
                continue
            self.geneGraph[gene][tran]['cds_beg'].append(max(cds_beg, beg))
            self.geneGraph[gene][tran]['cds_ext'].append(min(cds_end, end) - max(cds_beg, beg))
    self.__Only_Region()
    self.__Only_Longest_tran()
def test_screen_pairs(self): ''' test that screen_pairs() works correctly ''' # get the VCF lines lines = make_vcf_header() lines.append(make_vcf_line(pos=1)) lines.append(make_vcf_line(pos=2)) lines.append(make_vcf_line(pos=4)) lines.append(make_vcf_line(pos=5)) lines.append(make_vcf_line(pos=7)) lines.append(make_vcf_line(pos=8)) self.write_vcf(lines) vcf = tabix.open(self.path) pairs = [[('1', 2), ('1', 4)], [('1', 7), ('1', 8)]] self.assertEqual(screen_pairs(vcf, pairs, is_not_indel), pairs) # check that the other filter function also works cleanly self.assertEqual(screen_pairs(vcf, pairs, is_coding), pairs)
def get_coverage_stats(chrom, pos_start, pos_end):
    tb = tabix.open(COVERAGE_FOLDER + COVERAGE_FILE % chrom)
    records = tb.query(chrom, int(pos_start) - EXON_PADDING,
                       int(pos_end) + EXON_PADDING)
    pos_means = []
    for record in records:
        pos_means.append(float(record[COVERAGE_POS_MEAN_INDEX]))
    # Check that there is coverage data for the exon
    if pos_means:
        coverage_mean = "{0:.3f}".format(numpy.mean(pos_means))
        coverage_standard_deviation = "{0:.3f}".format(numpy.std(pos_means))
        coverage_max = max(pos_means)
        coverage_min = min(pos_means)
        coverage_stats = [coverage_mean, coverage_standard_deviation,
                          coverage_max, coverage_min]
    else:
        coverage_stats = [0, 0, 0, 0]
    return coverage_stats
def get_mnv_candidates(path): ''' identify MNV candidates, and their MNV consequences within a VCF. Args: path: path to VCF Returns: list of (variant, mnv_consequence) tuples, where variant is (chrom, pos) ''' with open_vcf(path) as vcf: exclude_header(vcf) header = get_vcf_header(vcf) pairs = find_nearby_variants(vcf) # ensure variants are not indels, are coding, and pairs alter the same amino # acid position vcf = tabix.open(path) pairs = screen_pairs(vcf, pairs, is_not_indel) pairs = screen_pairs(vcf, pairs, is_coding) pairs = same_aa(vcf, pairs) pattern = re.compile('[ACGT]') candidates = {} for pair in pairs: var1, var2 = list(get_matches(vcf, pair)) try: cq = check_mnv_consequence(var1, var2, pattern) candidates[pair[0]] = cq candidates[pair[1]] = cq except AssertionError: print('{0}:{1} and {0}:{2} in {3} have multiple alternative ' \ 'transcripts or odd codon sequences'.format(var1.chrom, var1.pos, var2.pos, path)) return candidates
#! /usr/local/bin/python
import sys
import tabix
from scipy import stats

inputFile = sys.argv[1]
tumorDepth = sys.argv[2]
normalDepth = sys.argv[3]

hIN = open(inputFile, 'r')
tumorDepth_tb = tabix.open(tumorDepth)
normalDepth_tb = tabix.open(normalDepth)

margin1 = 1000
margin2 = 500000
thres = 50

# cmp-style comparator for sorting chrom/pos lines
# (under Python 3, wrap with functools.cmp_to_key before passing to sort)
def cmp_chrPos(x1, x2):
    key1 = x1.split('\t')
    key2 = x2.split('\t')
    if key1[0] < key2[0]:
        return 1
    elif key1[0] > key2[0]:
        return -1
    else:
        if int(key1[1]) >= int(key2[1]):
            return 1
        else:
            return -1
prog_name = sys.argv[0].split('/')[-1]
if len(sys.argv) == 4:
    in_vcf = sys.argv[1]
    in_db_dir = sys.argv[2]
    maf_cut = float(sys.argv[3])
    print("[%s] %s run initiated." % (time.ctime(), prog_name), file=sys.stderr)
else:
    sys.exit("\nUsage: python %s <in.vcf> <in.EVS.vcf.gz.dir> <maf.cut>\n" % prog_name)
# fi

# Init tabix
dbs = {}
for chrom_id in [str(x) for x in range(1, 23)] + ['X', 'Y']:  # no mito var in 1000G
    file_to_glob = "%s/ESP6500SI-V2-SSA137.*.chr%s.*.vcf.gz" % (in_db_dir, chrom_id)
    db_file = glob.glob(file_to_glob)[0]
    dbs[chrom_id] = tabix.open(db_file)

# Proc VCF
for line in open(in_vcf, "r"):
    flag_printed = False
    if line.startswith('#'):
        print(line.strip())
        continue
    field = line.strip().split('\t')
    chrom = field[0]
    chrom_id = chrom.replace("chr", '')
    chrom_id = 'M' if chrom_id == "MT" else chrom_id
    one_pos = int(field[1])
    chr_pos = "%s:%s" % (chrom_id, one_pos)
    ref = field[3]