import numpy as np
from pysam import TabixFile


def get_interval_data(genes, INT):
    '''
    Get interval data for gene from input files in int_fns.
    '''
    for fn, name in INT:
        tb = TabixFile(fn)
        for g in genes:
            # Get region for searching replication timing data
            g_len = g.total_length
            midp = round((g.start + g.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if g_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                gstr = '%s:%d-%d' % (g.chrom, start, stop)
            else:
                gstr = '%s:%d-%d' % (g.chrom, g.start, g.stop)
            # Call to tabix to get data from bedGraph
            try:
                it_genes = tb.fetch(gstr)
            except ValueError:
                # handle regions where no interval can be made
                g.intervalData[name] = None
                continue
            intData = []
            for itr in it_genes:
                if itr == '':
                    continue
                itr = itr.split('\t')
                intData.append(float(itr[-1]))
            if len(intData) > 0:
                g.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend search if value not found
                extends0 = [50e3, 100e3, 500e3, 1e6]
                extends = [e for e in extends0 if e > g_len]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    gstr = '%s:%d-%d' % (g.chrom, start, stop)
                    it_genes = tb.fetch(gstr)
                    for itr in it_genes:
                        if itr == '':
                            continue
                        itr = itr.split('\t')
                        intData.append(float(itr[-1]))
                        found = True
                    if found:
                        g.intervalData[name] = np.mean(intData)
                        break
                if not found:
                    g.intervalData[name] = None
    return genes
def classify_peak(peak, sample, motifs):
    pc_peak = (peak.contig, peak.start + peak.summit - 300,
               peak.start + peak.summit + 300)
    nc_peak = (peak.contig, peak.start + peak.summit - 2000,
               peak.start + peak.summit + 2000)
    status = []
    for motif in motifs:
        fname = tf_peak_fnames[
            (motif.tf_name, RMID_term_name_mapping[sample])][0]
        fp = TabixFile(fname)
        if peak[0] not in fp.contigs:
            status.append(0)
            continue
        pc_peaks = list(fp.fetch(*pc_peak))
        if len(pc_peaks) > 0:
            status.append(1)
            continue
        nc_peaks = list(fp.fetch(*nc_peak))
        if len(nc_peaks) == 0:
            status.append(-1)
        else:
            status.append(0)
    return status
def main():
    args = parse_arguments()
    print(VariantFile(BUILD_TO_VCF[args.reference_build]).header)
    vcf_file = TabixFile(BUILD_TO_VCF[args.reference_build])
    rsid_file = TabixFile(BUILD_TO_RSID[args.reference_build],
                          index=f'{BUILD_TO_RSID[args.reference_build]}.csi')

    def rsid_to_coordinates(rsid):
        rs_number = int(rsid.replace('rs', ''))
        for row in rsid_file.fetch('rs', rs_number - 1, rs_number):
            chrom, pos = row.split()[2:]
            yield chrom, int(pos)

    for variant in args.variants:
        if COORD_REGEX.match(variant):
            chrom, pos = variant.split(':')
            chrom = chrom_to_hgvs(chrom, reference_build=args.reference_build)
            pos = int(pos)
            for row in vcf_file.fetch(chrom, pos - 1, pos):
                print(row)
        elif RSID_REGEX.match(variant):
            for chrom, pos in rsid_to_coordinates(variant):
                for row in vcf_file.fetch(chrom, pos - 1, pos):
                    print(row)
        else:
            raise RuntimeError('Improperly formatted query')
import numpy as np
from pysam import TabixFile


def get_interval_data(regions, INT):
    '''
    Get interval data for region for input files in int_fns.
    Computes the mean data value in a window around the region midpoint
    (at least 10 kb wide; extended up to 1 Mb if no data are found).
    '''
    for fn, name in INT:
        tb = TabixFile(fn)
        for r in regions:
            # Get region for searching replication timing data
            r_len = r.length
            midp = round((r.start + r.stop) / 2)
            min_width = 10e3  # search region at least 10 kb
            if r_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                rstr = '%s:%d-%d' % (r.chrom, start, stop)
            else:
                rstr = r.region_string
            try:
                it_regions = tb.fetch(rstr)
            except ValueError:
                # handle regions where no interval can be made
                r.intervalData[name] = None
                continue
            intData = []
            for rtr in it_regions:
                if rtr == '':
                    continue
                intData.append(float(rtr.split('\t')[-1]))
            if len(intData) > 0:
                r.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend search if value not found
                extends = [50e3, 100e3, 500e3, 1e6]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    rstr = '%s:%d-%d' % (r.chrom, start, stop)
                    it_regions = tb.fetch(rstr)
                    for rtr in it_regions:
                        if rtr == '':
                            continue
                        intData.append(float(rtr.split('\t')[-1]))
                        found = True
                    if found:
                        r.intervalData[name] = np.mean(intData)
                        break
                if not found:
                    r.intervalData[name] = None
    return regions
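# Usage sketch (not from the original source): drives get_interval_data above
# with a minimal stand-in for the region objects it expects
# (.chrom/.start/.stop/.length/.region_string/.intervalData). The bedGraph
# path is hypothetical and must be bgzipped + tabix-indexed.
class SimpleRegion:
    def __init__(self, chrom, start, stop):
        self.chrom, self.start, self.stop = chrom, start, stop
        self.length = stop - start
        self.region_string = '%s:%d-%d' % (chrom, start, stop)
        self.intervalData = {}


INT = [('repli_timing.bedGraph.gz', 'RepliSeq')]  # hypothetical track
regions = get_interval_data([SimpleRegion('chr1', 1000000, 1002000)], INT)
print(regions[0].intervalData['RepliSeq'])  # mean value around the region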
class _ALLC:
    def __init__(self, path, region):
        self.f = TabixFile(path)
        try:
            self.f_region = self.f.fetch(region)
        except ValueError:
            # region absent from the index; fall back to an empty iterator
            self.f_region = TabixIterator()

    def readline(self):
        return self.f_region.next()

    def close(self):
        self.f.close()
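# Usage sketch (not from the original source): streams one chromosome of a
# hypothetical bgzipped + tabix-indexed ALLC file through the _ALLC wrapper.
# It assumes the pysam iterator exposes .next(), as readline() above relies
# on, so the loop is bounded by StopIteration.
allc = _ALLC('cell_001.allc.tsv.gz', 'chr1')
try:
    while True:
        chrom, pos, strand, context, mc, cov, _ = allc.readline().split('\t')
        print(chrom, pos, context, mc, cov)
except StopIteration:
    pass
finally:
    allc.close()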
def __process_chromosome(self, chromosome_queue, tabix_reader: pysam.TabixFile):
    vcf = VCF()
    samples = vcf.get_sample_names(self.vcf_file)
    while True:
        try:
            chromosome, size = chromosome_queue.get()
        except queue.Empty:
            time.sleep(0.1)
            continue
        if chromosome is None:
            break
        write_header = True
        window_writer = open(
            os.path.join(self.binned_output_folder, chromosome + "_window.csv"),
            'w')
        chunks = self.sliding_window_generator(size)
        print("\nScreening: {}".format(chromosome))
        for start_pos, end_pos in chunks:
            records = tabix_reader.fetch(chromosome, start_pos, end_pos,
                                         multiple_iterators=True)
            vcf_arr = [SNP(line, samples) for line in list(records)]
            alleles_window_sample_dict = self.determine_alleles(vcf_arr, samples)
            self.__write_window_to_file(alleles_window_sample_dict,
                                        window_writer, chromosome,
                                        start_pos, end_pos, write_header)
            write_header = False
        window_writer.close()
def match_clinvar(self) -> None:
    """Match the input variants with the ClinVar table.

    Updates :attr:`CharGerResult.clinvar` when a variant matches a ClinVar
    record by calling :meth:`_match_clinvar_one_variant`.
    """
    if self.config.clinvar_table is None:
        logger.info("Skip matching ClinVar")
        return
    logger.info(
        f"Match input variants with ClinVar table at {self.config.clinvar_table}"
    )
    clinvar_match_num = 0
    with TabixFile(str(self.config.clinvar_table), encoding="utf8") as tabix:
        cols = tabix.header[0][len("#"):].split("\t")
        for result in self.results:
            record = self._match_clinvar_one_variant(result.variant, tabix, cols)
            if record is not None:
                result.clinvar = record
                clinvar_match_num += 1
    logger.success(
        f"Matched {clinvar_match_num:,d} out of {len(self.input_variants):,d} "
        f"input variants to a ClinVar record"
    )
import pathlib
from collections import defaultdict

import pandas as pd
from pysam import TabixFile


def get_allc_lambda_frac(allc_list, num_upstr_bases):
    num_upstr_bases = int(num_upstr_bases)
    records = {}
    for path in allc_list:
        mc_counts = defaultdict(int)
        cov_counts = defaultdict(int)
        with TabixFile(str(path)) as allc:
            cell = pathlib.Path(path).name.split('.')[0]
            try:
                for line in allc.fetch('chrL'):
                    chrom, pos, strand, context, mc, cov, _ = line.split('\t')
                    # this will lead to only four contexts: CA, CC, CT, CG
                    context = context[num_upstr_bases:num_upstr_bases + 2]
                    mc_counts[context] += int(mc)
                    cov_counts[context] += int(cov)
                df = pd.DataFrame({'mc': pd.Series(mc_counts),
                                   'cov': pd.Series(cov_counts)})
                # reindex so all four contexts exist
                df = df.reindex(['CG', 'CC', 'CT', 'CA']).fillna(0)
                cy_cov = df.loc['CT', 'cov'] + df.loc['CC', 'cov']
                if cy_cov > 0:
                    cy_frac = (df.loc['CT', 'mc'] + df.loc['CC', 'mc']) / cy_cov
                else:
                    cy_frac = 0
                records[cell] = {'LambdaCYFrac': cy_frac, 'LambdaCYCov': cy_cov}
            except ValueError:
                # no chrL lines
                records[cell] = {'LambdaCYFrac': 0, 'LambdaCYCov': 0}
    records = pd.DataFrame(records).T
    return records
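# Usage sketch (not from the original source): the ALLC paths are placeholders
# for bgzipped + tabix-indexed per-cell files containing a lambda spike-in
# contig named 'chrL'.
allc_list = ['cell_001.allc.tsv.gz', 'cell_002.allc.tsv.gz']
lambda_df = get_allc_lambda_frac(allc_list, num_upstr_bases=1)
print(lambda_df)  # one row per cell: LambdaCYFrac, LambdaCYCov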
def clinvar_tabix(test_root):
    return TabixFile(
        str(test_root.joinpath(
            "examples/annotations/clinvar_chrom_22_only.b37.tsv.gz")),
        encoding="utf8",
    )
from pysam import TabixFile, asTuple


def get_snp_data(*args, **kwargs):
    ''' proxy for TabixFile.fetch '''
    kwargs['multiple_iterators'] = True
    return TabixFile(SNP_FILE, parser=asTuple()).fetch(*args, **kwargs)
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False,
              passed=False, quality=False, diploid=False):
    """
    Convert a VCF file into a chain file for the given strain.

    :param input_file: path to the input VCF file
    :param fasta_file: path to the reference Fasta file
    :param strain: sample name in the VCF to build the chain for
    :param output_file: path to the output chain file
    :param vcf_keep: write discarded VCF records to a separate file
    :param passed: only use VCF records with a PASS filter
    :param quality: filter VCF records on quality
    :param diploid: create a diploid chain file
    :return:
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None
    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)

    sample_index = None
    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = dict(zip(samples, range(len(samples))))
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError(
                    "Unknown strain '{0}', valid strains are: {1}".format(
                        strain, ", ".join(samples)))
def __init__(self, chrom=None, pos=None, id=None, reference_build='GRCh38'):
    if chrom and pos and not id:
        self.chrom = (chrom_to_hgvs(chrom, reference_build=reference_build),)
        self.pos = (int(pos),)
    elif id and not (chrom or pos):
        rs_number = int(id.replace('rs', ''))
        self.chrom, self.pos = zip(
            *(row.split()[2:]
              for row in TabixFile(
                  BUILD_TO_RSID[reference_build],
                  index=f'{BUILD_TO_RSID[reference_build]}.csi'
              ).fetch('rs', rs_number - 1, rs_number)))
        self.pos = tuple(int(p) for p in self.pos)
    else:
        print('Invalid input parameters')
    _, _, self.id, self.ref, self.alt, _, _, self.info = zip(
        *(row.split()
          for chrom, pos in zip(self.chrom, self.pos)
          for row in TabixFile(BUILD_TO_VCF[reference_build]).fetch(
              chrom, pos - 1, pos)))
import numpy as np
import pybedtools as pbt
from pysam import TabixFile


def tabix_region(bedpath, querybt):
    """
    Uses tabix to extract all windows overlapping the span of querybt

    Returns:
        pbt.BedTool of all windows in query
    """
    # Format query as the minimal region spanning all intervals in querybt
    # (note: the span end uses nanmax; the original code used nanmin, which
    # would truncate the query to its first interval)
    coords = querybt.cut(range(3)).to_dataframe()
    chrom = querybt[0].chrom
    start = str(np.nanmin(coords.start))
    end = str(np.nanmax(coords.end))
    region = '{}:{}-{}'.format(chrom, start, end)

    # Extract all windows
    return pbt.BedTool('\n'.join([x for x in TabixFile(bedpath).fetch(region)]),
                       from_string=True)
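# Usage sketch (not from the original source): windows.bed.gz is a
# hypothetical bgzipped + tabix-indexed BED of genome-wide windows.
query = pbt.BedTool('chr1\t1000000\t1050000\n', from_string=True)
windows = tabix_region('windows.bed.gz', query)
print(len(windows), 'windows overlap the query span')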
def get_snps(pid):
    ''' return sequences mentioned in SNPData.csv '''
    coords = map(make_coord_string, snps.COORDINATES.values())
    search_args = {
        'coordinate': ','.join(coords),
        'patient': pid,
        '_count': 100000
    }
    seq_bundle = call_api('/Sequence', search_args)
    seqs = (entry['content'] for entry in seq_bundle['entry'])
    translation_f = TabixFile(SNP_TRANSLATION_FNAME, parser=asTuple())
    return jsonify({
        get_rsid(translation_f, seq): seq['observedSeq']
        for seq in seqs
    })
def _match_clinvar_one_variant(
        variant: Variant, tabix: TabixFile, cols: List[str]
) -> Optional[Dict[str, Any]]:
    """Match the variant to the given ClinVar tabix table.

    Args:
        variant: Variant to be matched
        tabix: Tabix indexed ClinVar table
        cols: All ClinVar columns in the table

    Returns:
        None if there is no ClinVar match. When matched, returns a `dict` of
        the ClinVar record, where the key ``final_clinical_significance``
        stores the final clinical significance type in
        :class:`ClinicalSignificance`.
    """
    try:
        # TabixFile.fetch will raise ValueError if the given region is out of bound
        row_iter = tabix.fetch(
            region=f"{variant.chrom}:{variant.start_pos}-{variant.end_pos}")
    except ValueError as e:
        # Do nothing if it's querying for a chromosome not in the ClinVar table
        if "could not create iterator for region" not in e.args[0]:
            logger.opt(exception=e).debug(f"Tabix fetch ClinVar failed: {e}")
        return None
    for row in row_iter:
        record = dict(zip(cols, row.split("\t")))
        if (int(record["start"]) == variant.start_pos
                and int(record["stop"]) == variant.end_pos
                and record["alt"] == variant.alt_allele):
            if record["ref"] != variant.ref_allele:
                logger.warning(
                    f"{variant!r} got a ClinVar match but their reference "
                    f"alleles are different: "
                    f"{variant.ref_allele!r} != {record['ref']!r}")
            # Parse the clinical significance of the record
            record["final_clinical_significance"] = \
                ClinicalSignificance.parse_clinvar_record(record)
            return record
    return None
class ExploreGnomad:
    def __init__(self, gnomad_file, frequency_table):
        self.gnomad = TabixFile(gnomad_file)
        self.frequencies = pd.read_csv(frequency_table, sep="\t", header=None)
        self.frequencies.columns = ["CHR:POS", "REF", "ALT", "AF"]
        self.frequencies[["CHR", "POS"]] = \
            self.frequencies["CHR:POS"].str.split(":", expand=True)
        self.frequencies["POS"] = self.frequencies["POS"].astype(int)

    def search_position(self, chr, pos, ref, alt):
        query_lines = self.gnomad.fetch(chr, pos - 1, pos)
        for variant in query_lines:
            variant_split = variant.split("\t")
            var_ref, var_alt = variant_split[3:5]
            if ref == var_ref and alt == var_alt:
                info_line = variant_split[-1]
                match = re.search(";AF_nfe=([0-9.e+\\-]+);", info_line)
                if match:
                    return match.group(1)
        return None

    def search_all(self, output_path):
        nfe_AF = [None] * len(self.frequencies)
        for i, row in self.frequencies.iterrows():
            if i % 1000 == 0:
                print(f"{round(100 * i / len(self.frequencies))} % Done")
            nfe_AF[i] = self.search_position(row["CHR"], row["POS"],
                                             row["REF"], row["ALT"])
        self.frequencies["nfe_AF"] = nfe_AF
        self.frequencies.to_csv(
            output_path, sep="\t", index=None,
            columns=["CHR", "POS", "REF", "ALT", "AF", "nfe_AF"])
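# Usage sketch (not from the original source): file names and the query are
# placeholders for a bgzipped + tabix-indexed gnomAD sites VCF and a 4-column
# table (CHR:POS, REF, ALT, AF) matching what __init__ expects.
eg = ExploreGnomad('gnomad.genomes.sites.vcf.bgz', 'cohort_frequencies.tsv')
print(eg.search_position('1', 55516888, 'G', 'A'))  # NFE AF string, or None
eg.search_all('cohort_with_nfe_af.tsv')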
def main():
    """
    Main block
    """
    # Parse command line arguments and options
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('gtf', help='GTF of genes to consider.')
    parser.add_argument('pext', help='BED of pext scores. Must be tabixed.')
    parser.add_argument('--min-pext', default=0.1, type=float,
                        help='Minimum mean pext score to retain exon. ' +
                        '[default: 0.1]')
    parser.add_argument('-o', '--outgtf', help='Path to output GTF file. ' +
                        '[default: stdout]')
    parser.add_argument('-z', '--bgzip', dest='bgzip', action='store_true',
                        help='Compress output GTF with bgzip.')
    parser.add_argument('--lost-genes', help='Path to output file listing ' +
                        'genes lost due to pext filtering.')
    args = parser.parse_args()

    # Open connection to output file
    # (saveas() below needs a path, so stdout is routed through /dev/stdout;
    # the original branch left outgtf_path undefined for stdout output)
    if args.outgtf is None \
            or args.outgtf in 'stdout -'.split():
        outgtf_path = '/dev/stdout'
    else:
        if path.splitext(args.outgtf)[-1] in '.gz .bz .bgz .bgzip .gzip'.split():
            outgtf_path = path.splitext(args.outgtf)[0]
        else:
            outgtf_path = args.outgtf

    # Load GTF & pext data
    gtfbt, genes, transcripts = load_gtf(args.gtf)
    pext = TabixFile(args.pext)

    # Apply pext filter
    gtfbt, filter_stats = pext_filter(gtfbt, pext, genes, args.min_pext)
    gtfbt.saveas(outgtf_path)
    filt_msg = 'Finished. Removed {:,} exons, resulting in the loss of {:,} genes.'
    print(filt_msg.format(filter_stats['n_exons_lost'],
                          filter_stats['n_genes_lost']) + '\n')

    # Bgzip output GTF, if optioned
    if args.outgtf is not None \
            and args.outgtf not in 'stdout -'.split() \
            and args.bgzip:
        subprocess.run(['bgzip', '-f', outgtf_path])

    # Write out list of lost genes, if optioned
    if args.lost_genes is not None:
        with open(args.lost_genes, 'w') as lost_out:
            for gene in filter_stats['genes_lost']:
                lost_out.write(gene + '\n')
GENOME_APP_NAME = basename(GENOME_APP_DIRECTORY_PATH)

INPUT_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'input')
PERSON_DIRECTORY_PATH = join(INPUT_DIRECTORY_PATH, 'person')
GRCH_DIRECTORY_PATH = join(INPUT_DIRECTORY_PATH, 'grch')
TOOLS_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'tools')
OUTPUT_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'output')
MEDIA_DIRECTORY_PATH = join(GENOME_APP_DIRECTORY_PATH, 'media')

REGION_FILE = join(INPUT_DIRECTORY_PATH, 'non_PAR_region.bed')
VCF_FILE = join(PERSON_DIRECTORY_PATH, 'genome.vcf.gz')

tbx = TabixFile(VCF_FILE)


def get_format_index():
    """
    Get the index of the FORMAT field in VCF file
    Returns:
        int: index of FORMAT field
    """
    try:
        for row in tbx.header:
            line = row.decode('UTF-8')
            if line.startswith('#CHROM') and 'FORMAT' in line:
                index = line.split('\t').index('FORMAT')
                return index
    except NameError:
def _fetch(self, region):
    if not self.has_tabix:
        raise Exception('Currently, tabix is required for region query')
    with TabixFile(self._gtf, parser=self._parser) as tabix:
        for row in tabix.fetch(region):
            yield row
contig = None
if args.chrom in contigs:
    contig = contigs[args.chrom]
elif 'chr%s' % args.chrom in contigs:
    contig = contigs['chr%s' % args.chrom]
else:
    raise Exception('Trouble finding contig', args.chrom, 'in', contig_names)
print('Chrom length', contig.length)

vcf_files = [args.vcf_file]
if args.additional_vcf_files is not None:
    vcf_files.extend(args.additional_vcf_files)

if np.all([os.path.isfile(vcf_file + '.tbi') for vcf_file in vcf_files]):
    vcfs = [TabixFile(vcf_file, parser=None) for vcf_file in vcf_files]

    if args.batch_size != -1:
        start_pos, end_pos = (args.batch_num * args.batch_size,
                              (args.batch_num + 1) * args.batch_size)
        print('Interval', start_pos, end_pos)
        if start_pos < contig.length:
            process_body(
                itertools.chain(*[
                    vcf.fetch(reference=contig.name, start=start_pos,
                              end=end_pos)
                    for vcf in vcfs
                ]),
                sample_ids)
        else:
            print('Interval (%d-%d) starts beyond the end of the chromosome '
                  '(length=%d).' % (start_pos, end_pos, contig.length))
def pair_bins(query_bins, all_bins, outfile, max_dist, exclusion_list,
              excl_buffer, annotate_dist, sort_features, annotate_absdiff,
              maxfloat, bgzip, input_has_header=True):
    """
    Create pairs of bins from input BED
    """
    # Open connection to infiles & outfile
    if determine_filetype(query_bins) == 'compressed-bed':
        fin = gzip.open(query_bins, 'rt')
    else:
        fin = open(query_bins)
    if input_has_header:
        colnames = [k.replace('#', '')
                    for k in fin.readline().rstrip().split('\t')]
    if all_bins is None:
        # Fall back to pairing query bins against themselves
        # (the original referenced an undefined name, "bins")
        bins_tabix = TabixFile(query_bins)
    else:
        bins_tabix = TabixFile(all_bins)
    xbt = load_exclusion_bts(exclusion_list, excl_buffer)

    # Open connection to output file
    out_ftype, out_ext = determine_filetype(outfile, return_extension=True)
    if 'compressed' in out_ftype:
        outpath = outfile.replace(out_ext, 'bed')
    else:
        outpath = outfile
    fout = open(outpath, 'w')

    # Format header and write to outfile
    header = '#chr start end'.split()
    if annotate_dist:
        header.append('distance')
    for fname in colnames[3:]:
        if sort_features:
            fname_suffixes = ['min', 'max']
        else:
            fname_suffixes = ['left', 'right']
        if annotate_absdiff:
            fname_suffixes.append('absdiff')
        header += ['_'.join([fname, v]) for v in fname_suffixes]
    fout.write('\t'.join(header) + '\n')

    # Identify and curate all pairs for each bin in fin
    for query_line in fin.readlines():
        query_vals = query_line.rstrip().split('\t')
        _get_pairs(fout, query_vals, bins_tabix, max_dist, xbt, annotate_dist,
                   sort_features, annotate_absdiff, maxfloat)

    # Clean up
    fout.close()
    if bgzip:
        bgz(outpath)
def setup(cls, source):
    curr = cls(source)
    curr.fs = [TabixFile(i, parser=asBed()) for i in curr.source]
    return curr
def main(self, args):
    command.Command.main(self, args)
    self.validate(args)
    for i in [1, 2]:
        attr = "pop%d" % i
        pid, ary = getattr(args, attr)
        if len(ary) == 1 and ary[0][0] == "@":
            setattr(args, attr, SampleList(
                pid, open(ary[0][1:], "rt").read().strip().split("\n")))
    pop_d = dict([args.pop1, args.pop2])
    for pid in pop_d:
        if pop_d[pid]:
            c = Counter(pop_d[pid])
            if max(c.values()) > 1:
                raise RuntimeError(
                    "Population %s has duplicated samples: %s" %
                    (pid, [item for item in c.items() if item[1] > 1]))
    dist = [[], []]
    if not args.d:
        first_sid = args.pop1.samples[0]
        args.d = [first_sid] * 2
    args.d = [args.d[0] + ":0", args.d[1] + ":1"]
    all_samples = set(args.pop1.samples) | set(args.pop2.samples)
    for sid_i in args.d:
        sid, i = sid_i.split(":")
        i = int(i)
        if sid not in all_samples:
            raise RuntimeError("%s is not in the sample list" % sid)
        if sid in args.pop1.samples:
            d = dist[0]
        else:
            assert sid in args.pop2.samples
            d = dist[1]
        d.append((sid, i))
    undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
              for p, d in zip((args.pop1, args.pop2), dist)]
    npop = 1

    def print_pop(i):
        logger.info("Population %d:" % i)
        logger.info("Distinguished lineages: " +
                    ", ".join("%s:%d" % t for t in dist[i - 1]))
        logger.info("Undistinguished lineages: " +
                    ", ".join("%s:%d" % t for t in undist[i - 1]))

    print_pop(1)
    if args.pop2.pid is not None:
        npop = 2
        common = set(args.pop1.samples) & set(args.pop2.samples)
        if common:
            logger.error("Populations 1 and 2 should be disjoint, "
                         "but both contain " + ", ".join(common))
            sys.exit(1)
        print_pop(2)

    # Start parsing
    vcf = VariantFile(args.vcf)
    with optional_gzip(args.out, "wt") as out:
        samples = list(vcf.header.samples)
        dist = dist[:npop]
        undist = undist[:npop]
        if not set([dd[0] for d in dist for dd in d]) <= set(samples):
            raise RuntimeError("Distinguished lineages not found in data?")
        missing = [s for u in undist for s, _ in u if s not in samples]
        if missing:
            msg = "The following samples were not found in the data: %s. " % \
                ", ".join(missing)
            if args.ignore_missing:
                logger.warn(msg)
            else:
                msg += ("If you want to continue without these samples, "
                        "use --ignore-missing.")
                raise RuntimeError(msg)
        undist = [[t for t in u if t[0] not in missing] for u in undist]

        # Write header
        pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
        out.write("# SMC++ ")
        json.dump({"version": version, "pids": pids,
                   "undist": undist, "dist": dist}, out)
        out.write("\n")
        na = list(map(len, dist))
        nb = list(map(len, undist))

        # function to convert a VCF record to our format:
        # <span, dist gt, # undist gt, # undist, [...]>
        def rec2gt(rec):
            ref = rec.alleles[0]
            for di in dist:
                for d, i in di:
                    if len(rec.samples[d].alleles) != 2:
                        raise RuntimeError(
                            "Expected a diploid genotype at position {} "
                            "for individual {} but found:\n{}".format(
                                rec.pos, d, list(rec.samples[d].alleles)))
            da = [[rec.samples[d].alleles[i] for d, i in di] for di in dist]
            a = [sum([x != ref for x in d]) if None not in d else -1
                 for d in da]
            bs = [[rec.samples[d].alleles[i] != ref
                   for d, i in un
                   if rec.samples[d].alleles[i] is not None]
                  for un in undist]
            b = [sum(_) for _ in bs]
            nb = [len(_) for _ in bs]
            # Fold non-polymorphic (in subsample) sites
            if np.array_equal(b, nb) and np.array_equal(a, na):
                a = [0] * len(a)
                b = [0] * len(b)
            return list(sum(zip(a, b, nb), tuple()))

        try:
            region_iterator = vcf.fetch(contig=args.contig)
        except ValueError as e:
            logger.error("VCF reader threw an error: %s", e)
            logger.error("Make sure the VCF is indexed:")
            logger.error("")
            logger.error("    $ tabix %s", args.vcf)
            logger.error("")
            sys.exit(1)

        contig_length = args.length or vcf.header.contigs[args.contig].length
        if contig_length is None:
            logger.error("Failed to acquire contig length from VCF header. "
                         "See the --length option.")
            sys.exit(1)
        if args.mask:
            mask_iterator = TabixFile(args.mask).fetch(reference=args.contig)
            args.missing_cutoff = np.inf
        else:
            mask_iterator = iter([])
            if args.missing_cutoff is None:
                args.missing_cutoff = np.inf
        mask_iterator = (x.split("\t") for x in mask_iterator)
        mask_iterator = ((x[0], int(x[1]), int(x[2])) for x in mask_iterator)
        snps_only = (rec for rec in region_iterator
                     if len(rec.alleles) <= 2
                     and all(len(a) == 1 for a in rec.alleles))

        def interleaved():
            cmask = next(mask_iterator, None)
            csnp = next(snps_only, None)
            while cmask or csnp:
                if cmask is None:
                    yield "snp", csnp
                    csnp = next(snps_only, None)
                elif csnp is None:
                    yield "mask", cmask
                    cmask = next(mask_iterator, None)
                else:
                    if csnp.pos < cmask[1]:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp.pos <= cmask[2]:
                        while csnp is not None and csnp.pos <= cmask[2]:
                            csnp = next(snps_only, None)
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)

        abnb_miss = [-1, 0, 0] * len(nb)
        abnb_nonseg = sum([[0, 0, x] for x in nb], [])
        multiples = set()
        with RepeatingWriter(out) as rw, \
                tqdm.tqdm(total=contig_length, unit='bases',
                          unit_scale=True) as bar:

            def write(x):
                if not write.first or not args.drop_first_last:
                    rw.write(x)
                write.first = False

            write.first = True
            last_pos = 0
            for ty, rec in interleaved():
                if ty == "mask":
                    span = rec[1] - last_pos
                    write([span] + abnb_nonseg)
                    write([rec[2] - rec[1] + 1] + abnb_miss)
                    last_pos = rec[2]
                    continue
                bar.update(rec.pos - last_pos)
                abnb = rec2gt(rec)
                if rec.pos == last_pos:
                    multiples.add(rec.pos)
                    continue
                span = rec.pos - last_pos - 1
                if 1 <= span <= args.missing_cutoff:
                    write([span] + abnb_nonseg)
                elif span > args.missing_cutoff:
                    write([span] + abnb_miss)
                write([1] + abnb)
                last_pos = rec.pos
            if not args.drop_first_last:
                write([contig_length - last_pos] + abnb_nonseg)
        if multiples:
            # FIXME: what to do with multiple records at same site
            logger.warn("Multiple entries found at %d positions; "
                        "skipped all but the first", len(multiples))
    if row['Chromosome'] is None:
        disease = row['SNP']
        continue
    row['disease'] = disease[0:-1]
    snps[row['SNP']] = row

with open('DrugInfo.csv') as src:
    drug_info = {row['SNP']: row for row in csv.DictReader(src)}

with open('okg.ped') as pop_src:
    # mapping: sample id -> population id
    populations = {
        indiv['Individual ID']: indiv['Population']
        for indiv in csv.DictReader(pop_src, delimiter='\t')
    }

print('Determining genomic coordinates for sequences.')
f = TabixFile('snps.sorted.txt.gz', parser=asTuple())
snp_table = {}
for row in f.fetch():
    _, snp, chrom, pos = row
    if snp in snps or snp in drug_info:
        snp_table[snp] = {'chromosome': chrom, 'pos': int(pos)}

with open('snps.py', 'w') as dump:
    dump.write(WARNING)
    dump.write('COORDINATES = %s\n' % snp_table)
    dump.write('DATA = %s\n' % snps)
    dump.write('DRUG_INFO = %s\n' % drug_info)
print('Data written to snps.py')

print('Determining allele frequencies (using data from 1000 Genomes)')
genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
variants = list(
    ga4gh.search_variants(genotypes, dataset=ga4gh.OKG, repo_id='google'))
import pysam
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]:region1[2]]
region1_annotations = [a for a in annotations.fetch(*region1,
                                                    parser=pysam.asGTF())]
region1_variants = [a for a in variants.fetch(*region1)]

region2_reference = reference[region2[0]][region2[1]:region2[2]]
region2_annotations = [a for a in annotations.fetch(*region2,
                                                    parser=pysam.asGTF())]
region2_variants = [a for a in variants.fetch(*region2)]
import numpy as np
import pandas as pd
from intake.source.base import DataSource, Schema
from pysam import TabixFile, asTuple


class IndexedBedFile(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "dataframe"
    partition_access = False
    description = "A bgzipped and indexed bedfile"

    def __init__(self, urlpath, include_unmapped=True, metadata=None):
        self._urlpath = urlpath
        self._include_unmapped = include_unmapped
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super(IndexedBedFile, self).__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = TabixFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)
        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategoricalDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {key: dtypes[key]
                       for key in list(dtypes.keys())[:num_fields]}
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        columns = list(self._dtype.keys())
        return pd.DataFrame(
            list(self._dataset.fetch(chrom, parser=asTuple())),
            columns=columns).astype(self._dtype)

    def read(self):
        self._load_metadata()
        return pd.concat(
            [self.read_partition(i) for i in range(self.npartitions)],
            ignore_index=True)

    def _close(self):
        # close any files, sockets, etc.
        if self._dataset is not None:
            self._dataset.close()
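# Usage sketch (not from the original source): regions.bed.gz is a placeholder
# for a bgzipped + tabix-indexed BED file; read() concatenates one DataFrame
# partition per contig.
src = IndexedBedFile('regions.bed.gz')
df = src.read()
print(df.dtypes)
print(df.head())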
def setup(cls, source):
    curr = cls(source)
    curr.f = TabixFile(curr.source, parser=asBed())
    return curr