def test_ambiguous_mask2(tmpdir):
    # the third row contains only ambiguous labels and gets dropped
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1\nchr3\t1\t3\t-1\t-1',
                         tmpdir)
    bt = BedDataset(bed_file, ambiguous_mask=-1)
    assert len(bt) == 2
    assert np.all(bt.get_targets().max(axis=1) >= 0)
def test_more_columns(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\tinterval1\t1\t0\nchr2\t1\t3\tinterval2\t0\t1',
                         tmpdir)
    with pytest.raises(Exception):
        bt = BedDataset(bed_file, label_dtype=bool)

    bt = BedDataset(bed_file, bed_columns=4, label_dtype=bool)
    assert bt[0][0].name == 'interval1'
    assert bt[1][0].name == 'interval2'

    with pytest.raises(Exception):
        bt = BedDataset(bed_file)
def test_ambiguous_mask(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1\nchr3\t1\t3\t0\t-1',
                         tmpdir)
    bt = BedDataset(bed_file)
    assert len(bt) == 3
    assert np.all(bt[2][1] == np.array([0, -1]))

    # same as before
    bt = BedDataset(bed_file, ambiguous_mask=-1)
    assert len(bt) == 3
    assert np.all(bt[2][1] == np.array([0, -1]))
    assert np.all(bt.get_targets().max(axis=1) >= 0)
def test_incl_excl_chromosomes(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1\nchr3\t1\t3\t0\t1',
                         tmpdir)
    bt = BedDataset(bed_file)
    assert len(bt) == 3

    bt = BedDataset(bed_file, incl_chromosomes=['chr1'])
    assert len(bt) == 1
    assert bt[0][0] == Interval("chr1", 1, 2)

    bt = BedDataset(bed_file, excl_chromosomes=['chr1'])
    assert len(bt) == 2
    assert bt[0][0] == Interval("chr2", 1, 3)
def test_bed3_labels(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr1\t1\t3\t0\t1', tmpdir)
    bt = BedDataset(bed_file)
    assert np.all(bt.get_targets() == np.array([[1, 0], [0, 1]]))
    assert len(bt) == 2
    assert bt.n_tasks == 2
    assert np.all(bt.df[0] == 'chr1')

    assert bt[0][0] == Interval("chr1", 1, 2)
    assert np.all(bt[0][1] == np.array([1, 0]))
    assert bt[1][0] == Interval("chr1", 1, 3)
    assert np.all(bt[1][1] == np.array([0, 1]))
    assert len(bt) == 2
def __init__(self, intervals_file, fasta_file, ignore_targets=True):
    self.bt = BedDataset(intervals_file,
                         bed_columns=3,
                         ignore_targets=ignore_targets)
    self.fasta_file = fasta_file
    self.fasta_extractor = None
    self.transform = OneHot()  # one-hot encode DNA sequence
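# A minimal sketch (not taken from the original source) of the __len__/__getitem__
# that would accompany the __init__ above, modelled on the lazy-extractor pattern
# used by ActivityDataset below. The return schema, the FastaExtractor output and
# the OneHot call signature are assumptions.
def __len__(self):
    return len(self.bt)


def __getitem__(self, idx):
    if self.fasta_extractor is None:
        # open the fasta file lazily so the dataset also works with
        # multiprocessing DataLoader workers
        self.fasta_extractor = FastaExtractor(self.fasta_file)
    interval, labels = self.bt[idx]
    seq = np.squeeze(self.fasta_extractor([interval]))  # assumed: raw sequence window
    return {
        "inputs": self.transform(seq),  # assumed: OneHot() is callable on the sequence
        "targets": labels,
        "metadata": {"ranges": GenomicRanges(interval.chrom, interval.start,
                                             interval.stop, str(idx))},
    }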
def test_bed3(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\nchr1\t1\t3', tmpdir)
    bt = BedDataset(bed_file)
    assert bt.n_tasks == 0
    assert len(bt) == 2
    assert np.all(bt.df[0] == 'chr1')
    assert bt[0] == (Interval("chr1", 1, 2), {})
    assert bt[1] == (Interval("chr1", 1, 3), {})
def test_tsvreader(tsv_file, num_chr, label_dtype):
    ds = BedDataset(tsv_file, label_dtype=label_dtype, num_chr=num_chr)
    interval, labels = ds[0]
    assert isinstance(interval, Interval)
    if not num_chr:
        assert interval.chrom.startswith("chr")
    assert isinstance(labels[0], label_dtype)
    assert interval.start == 2
    assert interval.end == 4
def test_label_dtype(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1', tmpdir)
    bt = BedDataset(bed_file, label_dtype=bool)
    assert len(bt) == 2
    assert bt[0][1].dtype == bool
    assert bt.get_targets().dtype == bool
def test_num_chr(tmpdir):
    bed_file = write_tmp('chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1\nchr3\t1\t3\t0\t-1',
                         tmpdir)
    bt = BedDataset(bed_file, num_chr=True)
    assert len(bt) == 3
    assert bt[0][0].chrom == '1'
class ActivityDataset(Dataset):
    """
    Args:
        intervals_file: bed4 file containing chrom, start, end, name
        fasta_file: file path; genome sequence
        bigwigs: dict mapping task names to lists of bigwig file paths;
            the coverage of each list is summed per task
        track_width: width to which the interval is resized before the
            bigwig coverage is summed
        incl_chromosomes: if given, keep only intervals on these chromosomes
        excl_chromosomes: if given, drop intervals on these chromosomes
        num_chr_fasta: if True, the bed loader will make sure that the
            chromosome names don't start with 'chr'
    """

    def __init__(self, intervals_file, fasta_file, bigwigs, track_width=2000,
                 incl_chromosomes=None, excl_chromosomes=None,
                 num_chr_fasta=False):
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.track_width = track_width

        self.tsv = BedDataset(self.intervals_file,
                              num_chr=self.num_chr_fasta,
                              bed_columns=4,
                              ignore_targets=True,
                              incl_chromosomes=incl_chromosomes,
                              excl_chromosomes=excl_chromosomes)

        self.fasta_extractor = None
        self.bigwig_extractors = None

    def __len__(self):
        return len(self.tsv)

    def __getitem__(self, idx):
        if self.fasta_extractor is None:
            # open the extractors lazily so the dataset also works with
            # multiprocessing DataLoader workers
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                                      for a in self.bigwigs}

        interval, labels = self.tsv[idx]

        # Intervals need to be 1000bp wide
        interval = resize_interval(interval, 1000)
        assert interval.stop - interval.start == 1000

        # Run the fasta extractor
        seq = np.squeeze(self.fasta_extractor([interval]))

        interval_wide = resize_interval(deepcopy(interval), self.track_width)

        return {
            "inputs": {"seq": seq},
            "targets": {a: sum([e([interval_wide])[0]
                                for e in self.bigwig_extractors[a]]).sum()
                        for a in self.bigwig_extractors},
            "metadata": {
                "ranges": GenomicRanges(interval.chrom, interval.start,
                                        interval.stop, str(idx)),
                "ranges_wide": GenomicRanges.from_interval(interval_wide),
                "name": interval.name
            }
        }

    def get_targets(self):
        return self.tsv.get_targets()
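# Hedged usage sketch for ActivityDataset. The file paths and the task name
# below are illustrative placeholders, not files shipped with the repository;
# `bigwigs` maps each task name to a list of bigwig files whose coverage is
# summed over the resized interval.
if __name__ == "__main__":
    ds = ActivityDataset(
        intervals_file="intervals.bed",   # bed4: chrom, start, end, name
        fasta_file="genome.fa",           # reference genome
        bigwigs={"task1": ["task1_rep1.bw", "task1_rep2.bw"]},
        track_width=2000,
        incl_chromosomes=["chr1"],
    )
    sample = ds[0]
    print(sample["inputs"]["seq"].shape)   # 1000bp window; encoding depends on FastaExtractor
    print(sample["targets"]["task1"])      # summed bigwig coverage over the wide interval
    print(sample["metadata"]["ranges_wide"])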
def __init__(self, intervals_file, fasta_file, vcf_file, chr_order_file,
             vcf_file_tbi=None, strand_column=6, id_column=4, num_chr=True):
    # workaround for test
    if vcf_file_tbi is not None and vcf_file_tbi.endswith("vcf_file_tbi"):
        os.rename(vcf_file_tbi,
                  vcf_file_tbi.replace("vcf_file_tbi", "vcf_file.tbi"))

    self.num_chr_fasta = num_chr
    self.intervals_file = intervals_file
    self.fasta_file = fasta_file
    self.vcf_file = vcf_file
    self.chr_order_file = chr_order_file
    self.strand_column = strand_column - 1  # convert to 0-based column index
    self.id_column = id_column - 1  # convert to 0-based column index
    self.force_upper = True

    # "Parse" bed file
    self.bed = BedDataset(self.intervals_file,
                          num_chr=self.num_chr_fasta,
                          bed_columns=3,
                          label_dtype=str,
                          ignore_targets=False)

    # Intersect bed and vcf using bedtools.
    # bedtools -c flag: for each bed interval, count the number of vcf entries it overlaps
    bed_tool = pybedtools.BedTool(self.intervals_file)
    intersect_counts = list(bed_tool.intersect(self.vcf_file, c=True,
                                               sorted=True,
                                               g=self.chr_order_file))
    intersect_counts = np.array([isect.count for isect in intersect_counts])

    # Retain only those transcripts that intersect a variant
    utr5_bed = self.bed.df
    id_col = utr5_bed.iloc[:, self.id_column]
    retain_transcripts = utr5_bed[intersect_counts > 0].iloc[:, self.id_column]
    utr5_bed = utr5_bed[utr5_bed.iloc[:, self.id_column].isin(retain_transcripts)]

    # Aggregate 5'UTR positions per transcript
    tuples = list(zip(utr5_bed.iloc[:, 1], utr5_bed.iloc[:, 2]))
    pos = [[x] for x in tuples]
    id_chr_strand = list(zip(utr5_bed.iloc[:, self.id_column],
                             utr5_bed.iloc[:, 0],
                             utr5_bed.iloc[:, self.strand_column]))
    utr5_bed_posaggreg = pd.DataFrame({"pos": pos,
                                       "id_chr_strand": id_chr_strand})
    utr5_bed_posaggreg = utr5_bed_posaggreg.groupby("id_chr_strand").agg({'pos': 'sum'})

    # Rebuild "bed"
    utr5_bed_posaggreg["id"] = [x[0] for x in utr5_bed_posaggreg.index]
    utr5_bed_posaggreg["chr"] = [x[1] for x in utr5_bed_posaggreg.index]
    utr5_bed_posaggreg["strand"] = [x[2] for x in utr5_bed_posaggreg.index]
    self.bed = utr5_bed_posaggreg.reset_index()[["id", "chr", "pos", "strand"]]

    self.fasta_extractor = None
    self.vcf = None
    self.vcf_extractor = None
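# Standalone illustration (not from the original source) of the aggregation
# trick used above: because each "pos" entry is a one-element list holding a
# (start, end) tuple, `agg({'pos': 'sum'})` concatenates the lists per group,
# yielding one list of 5'UTR coordinate pairs per (id, chr, strand) key.
# Depending on the pandas version, summing object-dtype lists falls back to a
# Python-level reduction and may be slow on large inputs.
import pandas as pd

demo = pd.DataFrame({
    "pos": [[(100, 150)], [(200, 260)], [(300, 320)]],
    "id_chr_strand": [("tx1", "chr1", "+"),
                      ("tx1", "chr1", "+"),
                      ("tx2", "chr1", "-")],
})
agg = demo.groupby("id_chr_strand").agg({'pos': 'sum'})
print(agg["pos"].tolist())
# [[(100, 150), (200, 260)], [(300, 320)]]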