from pysam import TabixFile


def classify_peak(peak, sample, motifs):
    """Classify a peak against each motif's TF ChIP peaks.

    Returns one status per motif: 1 if a TF peak lies within +/-300 bp of
    the summit, -1 if none lies within +/-2 kb, and 0 otherwise.
    """
    # Narrow (+/-300 bp) and wide (+/-2 kb) windows around the peak summit
    pc_peak = (peak.contig, peak.start + peak.summit - 300,
               peak.start + peak.summit + 300)
    nc_peak = (peak.contig, peak.start + peak.summit - 2000,
               peak.start + peak.summit + 2000)
    status = []
    for motif in motifs:
        # tf_peak_fnames and RMID_term_name_mapping are module-level lookups
        fname = tf_peak_fnames[(motif.tf_name, RMID_term_name_mapping[sample])][0]
        fp = TabixFile(fname)
        if peak.contig not in fp.contigs:
            status.append(0)
            continue
        pc_peaks = list(fp.fetch(*pc_peak))
        if len(pc_peaks) > 0:
            status.append(1)
            continue
        nc_peaks = list(fp.fetch(*nc_peak))
        if len(nc_peaks) == 0:
            status.append(-1)
        else:
            status.append(0)
    return status
import numpy as np
from pysam import TabixFile


def get_interval_data(genes, INT):
    '''Get interval data for each gene from the input files in int_fns.'''
    for fn, name in INT:
        tb = TabixFile(fn)
        for g in genes:
            # Get region for searching replication timing data
            g_len = g.total_length
            midp = round((g.start + g.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if g_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                gstr = '%s:%d-%d' % (g.chrom, start, stop)
            else:
                gstr = '%s:%d-%d' % (g.chrom, g.start, g.stop)
            # Call to tabix to get data from the bedGraph
            try:
                it_genes = tb.fetch(gstr)
            except ValueError:
                # handle regions where no interval can be made
                g.intervalData[name] = None
                continue
            intData = []
            for itr in it_genes:
                if itr == '':
                    continue
                intData.append(float(itr.split('\t')[-1]))
            if len(intData) > 0:
                g.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend the search if no value was found, but only to
                # windows wider than the gene itself
                extends = [e for e in (50e3, 100e3, 500e3, 1e6) if e > g_len]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    gstr = '%s:%d-%d' % (g.chrom, start, stop)
                    it_genes = tb.fetch(gstr)
                    for itr in it_genes:
                        if itr == '':
                            continue
                        intData.append(float(itr.split('\t')[-1]))
                        found = True
                    if found:
                        g.intervalData[name] = np.mean(intData)
                        break
                if not found:
                    g.intervalData[name] = None
    return genes
def main():
    args = parse_arguments()
    print(VariantFile(BUILD_TO_VCF[args.reference_build]).header)
    vcf_file = TabixFile(BUILD_TO_VCF[args.reference_build])
    rsid_file = TabixFile(BUILD_TO_RSID[args.reference_build],
                          index=f'{BUILD_TO_RSID[args.reference_build]}.csi')

    def rsid_to_coordinates(rsid):
        # The rsID table is indexed with a literal "rs" contig and the
        # rs number as the position
        rs_number = int(rsid.replace('rs', ''))
        for row in rsid_file.fetch('rs', rs_number - 1, rs_number):
            chrom, pos = row.split()[2:]
            yield chrom, int(pos)

    for variant in args.variants:
        if COORD_REGEX.match(variant):
            chrom, pos = variant.split(':')
            chrom = chrom_to_hgvs(chrom, reference_build=args.reference_build)
            pos = int(pos)
            for row in vcf_file.fetch(chrom, pos - 1, pos):
                print(row)
        elif RSID_REGEX.match(variant):
            for chrom, pos in rsid_to_coordinates(variant):
                for row in vcf_file.fetch(chrom, pos - 1, pos):
                    print(row)
        else:
            raise RuntimeError('Improperly formatted query')
import numpy as np
from pysam import TabixFile


def get_interval_data(regions, INT):
    '''Get interval data for each region from the input files in int_fns.

    Computes the mean data value over the region (at least a 10 kb window
    around its midpoint), widening the window up to 1 Mb until data is found.
    '''
    for fn, name in INT:
        tb = TabixFile(fn)
        for r in regions:
            # Get region for searching replication timing data
            r_len = r.length
            midp = round((r.start + r.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if r_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                rstr = '%s:%d-%d' % (r.chrom, start, stop)
            else:
                rstr = r.region_string
            try:
                it_regions = tb.fetch(rstr)
            except ValueError:
                # handle regions where no interval can be made
                r.intervalData[name] = None
                continue
            intData = []
            for rtr in it_regions:
                if rtr == '':
                    continue
                intData.append(float(rtr.split('\t')[-1]))
            if len(intData) > 0:
                r.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend search if value not found
                extends = [50e3, 100e3, 500e3, 1e6]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    rstr = '%s:%d-%d' % (r.chrom, start, stop)
                    it_regions = tb.fetch(rstr)
                    for rtr in it_regions:
                        if rtr == '':
                            continue
                        intData.append(float(rtr.split('\t')[-1]))
                        found = True
                    if found:
                        r.intervalData[name] = np.mean(intData)
                        break
                if not found:
                    r.intervalData[name] = None
    return regions
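# A minimal sketch of calling get_interval_data. The Region class, file name,
# and track name below are hypothetical stand-ins for the caller's actual
# region objects and bgzipped, tabix-indexed bedGraph tracks.
class Region:
    def __init__(self, chrom, start, stop):
        self.chrom, self.start, self.stop = chrom, start, stop
        self.length = stop - start
        self.region_string = '%s:%d-%d' % (chrom, start, stop)
        self.intervalData = {}


regions = [Region('chr1', 1000000, 1002000)]
regions = get_interval_data(regions, [('repli_timing.bedGraph.gz', 'repliseq')])
print(regions[0].intervalData['repliseq'])  # mean track value, or None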
def __process_chromosome(self, chromosome_queue, tabix_reader: pysam.TabixFile):
    vcf = VCF()
    samples = vcf.get_sample_names(self.vcf_file)
    while True:
        try:
            # Poll with a timeout; a blocking get() would never raise
            # queue.Empty, leaving the except branch unreachable
            chromosome, size = chromosome_queue.get(timeout=0.1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        if chromosome is None:  # sentinel: no more work
            break
        write_header = True
        window_writer = open(
            os.path.join(self.binned_output_folder, chromosome + "_window.csv"),
            'w')
        chunks = self.sliding_window_generator(size)
        print("\nScreening: {}".format(chromosome))
        for start_pos, end_pos in chunks:
            records = tabix_reader.fetch(chromosome, start_pos, end_pos,
                                         multiple_iterators=True)
            vcf_arr = [SNP(line, samples) for line in records]
            alleles_window_sample_dict = self.determine_alleles(vcf_arr, samples)
            self.__write_window_to_file(alleles_window_sample_dict,
                                        window_writer, chromosome,
                                        start_pos, end_pos, write_header)
            write_header = False
        window_writer.close()
from pysam import TabixFile


class _ALLC:
    """Minimal line reader over one region of a tabix-indexed ALLC file."""

    def __init__(self, path, region):
        self.f = TabixFile(path)
        try:
            self.f_region = self.f.fetch(region)
        except ValueError:
            # Region is absent from the index; fall back to an empty iterator
            self.f_region = iter(())

    def readline(self):
        return next(self.f_region)

    def close(self):
        self.f.close()
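# A minimal usage sketch for _ALLC, assuming "sample_allc.tsv.gz" is a
# bgzipped, tabix-indexed ALLC file; the path and region are hypothetical.
allc = _ALLC('sample_allc.tsv.gz', 'chr1:1000000-2000000')
try:
    while True:
        try:
            line = allc.readline()  # StopIteration marks the end of the region
        except StopIteration:
            break
        chrom, pos = line.split('\t')[:2]
finally:
    allc.close()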
def _match_clinvar_one_variant(
    variant: Variant,
    tabix: TabixFile,
    cols: List[str],
) -> Optional[Dict[str, Any]]:
    """Match the variant to the given ClinVar tabix table.

    Args:
        variant: Variant to be matched
        tabix: Tabix-indexed ClinVar table
        cols: All ClinVar columns in the table

    Returns:
        None if no ClinVar match. When matched, returns a `dict` of the
        ClinVar record, where the key ``final_clinical_significance`` stores
        the final clinical significance type in :class:`ClinicalSignificance`.
    """
    try:
        # TabixFile.fetch will raise ValueError if the given region is out of bound
        row_iter = tabix.fetch(
            region=f"{variant.chrom}:{variant.start_pos}-{variant.end_pos}")
    except ValueError as e:
        # Do nothing if it's querying for a chromosome not in the ClinVar table
        if "could not create iterator for region" not in e.args[0]:
            logger.opt(exception=e).debug(f"Tabix fetch ClinVar failed: {e}")
        return None
    for row in row_iter:
        record = dict(zip(cols, row.split("\t")))
        if (int(record["start"]) == variant.start_pos
                and int(record["stop"]) == variant.end_pos
                and record["alt"] == variant.alt_allele):
            if record["ref"] != variant.ref_allele:
                logger.warning(
                    f"{variant!r} got a ClinVar match but their reference alleles "
                    f"are different: {variant.ref_allele!r} != {record['ref']!r}")
            # Parse the clinical significance of the record
            record["final_clinical_significance"] = \
                ClinicalSignificance.parse_clinvar_record(record)
            return record
    return None
import re

import pandas as pd
from pysam import TabixFile


class ExploreGnomad:
    def __init__(self, gnomad_file, frequency_table):
        self.gnomad = TabixFile(gnomad_file)
        self.frequencies = pd.read_csv(frequency_table, sep="\t", header=None)
        self.frequencies.columns = ["CHR:POS", "REF", "ALT", "AF"]
        self.frequencies[["CHR", "POS"]] = (
            self.frequencies["CHR:POS"].str.split(":", expand=True))
        self.frequencies["POS"] = self.frequencies["POS"].astype(int)

    def search_position(self, chrom, pos, ref, alt):
        # Tabix coordinates are 0-based, half-open; VCF positions are 1-based
        query_lines = self.gnomad.fetch(chrom, pos - 1, pos)
        for variant in query_lines:
            variant_split = variant.split("\t")
            var_ref, var_alt = variant_split[3:5]
            if ref == var_ref and alt == var_alt:
                # INFO is the last column of a sites-only gnomAD VCF
                info_line = variant_split[-1]
                match = re.search(r";AF_nfe=([0-9.e+\-]+);", info_line)
                if match:
                    return match.group(1)
        return None

    def search_all(self, output_path):
        nfe_AF = [None] * len(self.frequencies)
        for i, row in self.frequencies.iterrows():
            if i % 1000 == 0:
                print(f"{round(100 * i / len(self.frequencies))} % done")
            nfe_AF[i] = self.search_position(row["CHR"], row["POS"],
                                             row["REF"], row["ALT"])
        self.frequencies["nfe_AF"] = nfe_AF
        self.frequencies.to_csv(
            output_path, sep="\t", index=False,
            columns=["CHR", "POS", "REF", "ALT", "AF", "nfe_AF"])
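# Hypothetical usage sketch for ExploreGnomad: both paths are placeholders,
# assuming a bgzipped, tabix-indexed sites-only gnomAD VCF and a four-column
# frequency table (CHR:POS, REF, ALT, AF).
explorer = ExploreGnomad("gnomad.genomes.sites.vcf.bgz", "frequencies.tsv")
explorer.search_all("frequencies_with_nfe.tsv")  # adds an nfe_AF column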
            continue
        row['disease'] = disease[0:-1]
        snps[row['SNP']] = row

with open('DrugInfo.csv') as src:
    drug_info = {row['SNP']: row for row in csv.DictReader(src)}

with open('okg.ped') as pop_src:
    # mapping: sample id -> population id
    populations = {
        indiv['Individual ID']: indiv['Population']
        for indiv in csv.DictReader(pop_src, delimiter='\t')
    }

print('Determining genomic coordinates for sequences.')
f = TabixFile('snps.sorted.txt.gz', parser=asTuple())
snp_table = {}
for row in f.fetch():
    _, snp, chrom, pos = row
    if snp in snps or snp in drug_info:
        snp_table[snp] = {'chromosome': chrom, 'pos': int(pos)}

with open('snps.py', 'w') as dump:
    dump.write(WARNING)
    dump.write('COORDINATES = %s\n' % snp_table)
    dump.write('DATA = %s\n' % snps)
    dump.write('DRUG_INFO = %s\n' % drug_info)
print('Data written to snps.py')

print('Determining allele frequencies (using data from 1000 Genomes)')
genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
variants = list(
    ga4gh.search_variants(genotypes, dataset=ga4gh.OKG, repo_id='google'))

# determine allele frequencies for different populations
freqs = {
import pysam
from pysam import VariantFile, TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]:region1[2]]
region1_annotations = [a for a in annotations.fetch(*region1, parser=pysam.asGTF())]
region1_variants = [v for v in variants.fetch(*region1)]

region2_reference = reference[region2[0]][region2[1]:region2[2]]
region2_annotations = [a for a in annotations.fetch(*region2, parser=pysam.asGTF())]
region2_variants = [v for v in variants.fetch(*region2)]
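# A small follow-up sketch summarizing the queries above (assumes the data
# files listed earlier actually exist alongside their indexes).
for name, (seq, anns, snvs) in {
        "region1": (region1_reference, region1_annotations, region1_variants),
        "region2": (region2_reference, region2_annotations, region2_variants),
}.items():
    print(f"{name}: {len(seq)} bp, {len(anns)} gene models, {len(snvs)} variants")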
import numpy as np
import pandas as pd
from intake.source.base import DataSource, Schema
from pysam import TabixFile, asTuple


class IndexedBedFile(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "dataframe"
    partition_access = False
    description = "A bgzipped and indexed bedfile"

    def __init__(self, urlpath, include_unmapped=True, metadata=None):
        self._urlpath = urlpath
        self._include_unmapped = include_unmapped
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super(IndexedBedFile, self).__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = TabixFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)

        # Peek at one record to see how many BED columns are present
        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategoricalDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {key: dtypes[key] for key in list(dtypes.keys())[:num_fields]}
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        columns = list(self._dtype.keys())
        return pd.DataFrame(
            list(self._dataset.fetch(chrom, parser=asTuple())),
            columns=columns,
        ).astype(self._dtype)

    def read(self):
        self._load_metadata()
        return pd.concat(
            [self.read_partition(i) for i in range(self.npartitions)],
            ignore_index=True,
        )

    def _close(self):
        # close any files, sockets, etc.
        if self._dataset is not None:
            self._dataset.close()
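# Hypothetical usage sketch for IndexedBedFile: "peaks.bed.gz" is a
# placeholder for a bgzipped BED file with a tabix .tbi index alongside.
source = IndexedBedFile("peaks.bed.gz")
df = source.read()  # one partition per contig, concatenated
print(df.dtypes)
source.close()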