Example #1
def extractor(intervals_file,
              input_data_sources,
              target_data_sources=None,
              batch_size=128):
    """BatchGenerator

    Args:
        intervals_file: tsv file
            Assumes bed-like `chrom start end id` format.
        input_data_sources: dict
            mapping from input name to genomelake directory
        target_data_sources: dict, optional
            mapping from input name to genomelake directory
        batch_size: int
    """
    bt = pybedtools.BedTool(intervals_file)
    input_data_extractors = {
        key: ArrayExtractor(data_source)
        for key, data_source in input_data_sources.items()
    }
    if target_data_sources is not None:
        target_data_extractors = {
            key: ArrayExtractor(data_source)
            for key, data_source in target_data_sources.items()
        }
    intervals_generator = batch_iter(bt, batch_size)
    for intervals_batch in intervals_generator:
        out = {}
        # get data
        out['inputs'] = {
            key: extractor(intervals_batch)[..., None]  # add channel axis for conv1d
            for key, extractor in input_data_extractors.items()
        }
        if target_data_sources is not None:
            out['targets'] = {
                key: extractor(intervals_batch)[..., None]  # add channel axis for conv1d
                for key, extractor in target_data_extractors.items()
            }
        # get metadata
        out['metadata'] = {}
        chrom = []
        start = []
        end = []
        ids = []
        for interval in intervals_batch:
            chrom.append(interval.chrom)
            start.append(interval.start)
            end.append(interval.stop)
            ids.append(interval.name)

        out['metadata']['ranges'] = {}
        out['metadata']['ranges']['chr'] = np.array(chrom)
        out['metadata']['ranges']['start'] = np.array(start)
        out['metadata']['ranges']['end'] = np.array(end)
        out['metadata']['ranges']['id'] = np.array(ids)

        yield out
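A minimal consumption sketch for this generator; hedged: the tsv path, the genomelake directory, and the 'dnase' input name below are placeholders, not from the original project.

# Hypothetical usage sketch -- paths and the 'dnase' input name are placeholders.
batches = extractor('intervals.tsv',
                    input_data_sources={'dnase': 'dnase_genomelake_dir/'},
                    batch_size=64)
batch = next(batches)
print(batch['inputs']['dnase'].shape)          # trailing channel axis added for conv1d
print(batch['metadata']['ranges']['chr'][:5])  # chromosomes of the first 5 intervals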
Example #2
def test_array_extractor_fasta(mode, in_memory):
    data_dir = 'tests/data/fasta_test_dir_{}_{}'.format(mode, in_memory)
    backend.extract_fasta_to_file(
        'tests/data/fasta_test.fa',
        data_dir,
        mode=mode,
        overwrite=True)
    extractor = ArrayExtractor(data_dir, in_memory=in_memory)
    intervals = [Interval('chr1', 0, 10),
                 Interval('chr2', 0, 10)]
    expected_data = np.array(
        [[[ 1.  ,  0.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  0.  ,  1.  ,  0.  ],
          [ 0.  ,  0.  ,  0.  ,  1.  ],
          [ 1.  ,  0.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  0.  ,  1.  ,  0.  ],
          [ 0.  ,  0.  ,  0.  ,  1.  ]],

         [[ 1.  ,  0.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  0.  ,  1.  ,  0.  ],
          [ 0.  ,  0.  ,  0.  ,  1.  ],
          [ 0.25,  0.25,  0.25,  0.25],
          [ 1.  ,  0.  ,  0.  ,  0.  ],
          [ 0.  ,  1.  ,  0.  ,  0.  ],
          [ 0.  ,  0.  ,  1.  ,  0.  ],
          [ 0.  ,  0.  ,  0.  ,  1.  ],
          [ 0.25,  0.25,  0.25,  0.25]]], dtype=np.float32)
    data = extractor(intervals)
    assert (data == expected_data).all()
Example #3
def test_array_extractor_bigwig(test_bigwig_and_intervals, mode, in_memory):
    bw_path, intervals, expected_data = test_bigwig_and_intervals
    bw_dir_path = "{}.dir".format(bw_path)
    backend.extract_bigwig_to_file(
        bw_path, bw_dir_path, mode=mode, overwrite=True)
    extractor = ArrayExtractor(bw_dir_path, in_memory=in_memory)

    data = extractor(intervals)
    assert (data == expected_data).all()
Example #4
# retrieve data
data = Data_Directories()
print(data.intervals.keys())
print(data.input_atac['day0'].keys())
print(data.output_histone['day0'].keys())

# get intervals for day0 data
day0_intervals = list(BedTool(data.intervals['day0']))
print('# of Intervals Extracted for day0: {}'.format(len(day0_intervals)))

# create an ArrayExtractor for day0 ATAC-seq at the 140 bp fragment length
bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140'])
print('Finished extracting bigwig for day0, 140bp')

# create a BigwigExtractor for the histone mark H3K27ac for day0
bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
print('Finished extracting bigwig for day0, H3K27ac')

# normalize day0 intervals, keeping only those that normalize successfully
normalized_day0_intervals = [
    norm for interval in day0_intervals
    if (norm := normalize_interval(interval, window_size))
]
Example #5
train_intervals = list(BedTool(train_dir))
val_intervals = list(BedTool(val_dir))
test_intervals = list(BedTool(test_dir))
print('# of Train Intervals: {}'.format(len(train_intervals)))
print('# of Val Intervals: {}'.format(len(val_intervals)))
print('# of Test Intervals: {}'.format(len(test_intervals)))

# Get input/output data directories
data = Data_Directories()
print(data.intervals.keys())
print(data.input_atac[day].keys())
print(data.output_histone[day].keys())

# Extract input candidates
# Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length
input_candidates = ArrayExtractor(data.input_atac[day][frag])
print('Finished extracting bigwig for {}, {}bp'.format(day, frag))

# Extract output candidates
# Create a BigwigExtractor for histone mark of a given day
output_candidates = BigwigExtractor(data.output_histone[day][histone])
print('Finished extracting bigwig for {}, {}'.format(day, histone))

# Normalize train intervals
normalized_train_intervals = [
    norm for interval in train_intervals
    if (norm := normalize_interval(interval, window_size))
]
print('Finished normalizing train intervals!')
# Normalize val intervals
normalized_val_intervals = [
    norm for interval in val_intervals
    if (norm := normalize_interval(interval, window_size))
]
print('Finished normalizing val intervals!')
# Normalize test intervals
normalized_test_intervals = [
    norm for interval in test_intervals
    if (norm := normalize_interval(interval, window_size))
]
print('Finished normalizing test intervals!')
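A hedged sketch of the step that would typically follow, assuming (as in the other examples here) that the extractors are called directly on a list of intervals:

# Sketch (assumption): materialize input/target arrays for the normalized intervals.
X_train = input_candidates(normalized_train_intervals)
y_train = output_candidates(normalized_train_intervals)
print('Train inputs: {}, targets: {}'.format(X_train.shape, y_train.shape))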
Example #6
    def __getitem__(self, idx):
        if self.fasta_extractor is None:
            # Use array extractors
            if self.bcolz:
                self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                      in_memory=False)
                self.bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                self.bias_bw_extractors = {
                    task: [
                        ArrayExtractor(task_spec.pos_counts, in_memory=False),
                        ArrayExtractor(task_spec.neg_counts, in_memory=False)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                    if task in self.tasks
                }
            else:
                # Use normal fasta/bigwig extractors
                assert not self.bcolz
                # first call
                self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                      use_strand=True)
                self.bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
                self.bias_bw_extractors = {
                    task: [
                        BigwigExtractor(task_spec.pos_counts),
                        BigwigExtractor(task_spec.neg_counts)
                    ]
                    for task, task_spec in self.ds.bias_specs.items()
                }

        # Setup the intervals
        interval = Interval(
            self.dfm.iat[idx, 0],  # chrom
            self.dfm.iat[idx, 1],  # start
            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when the task name from the dataspec is stored in
        # the 4th column (index 3). It doesn't apply when using intervals_file.
        interval_from_task = self.dfm.iat[idx, 3] if self.intervals_file is None else ''

        # extract seq + tracks
        sequence = self.fasta_extractor([seq_interval])[0]

        if not self.only_classes:
            if self.taskname_first:
                cuts = {
                    f"{task}/profile":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }
            else:
                cuts = {
                    f"profile/{task}":
                    run_extractors(self.bw_extractors[task], [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for task, spec in self.ds.task_specs.items()
                    if task in self.tasks
                }

            # Add counts
            if self.target_transformer is not None:
                cuts = self.target_transformer.transform(cuts)

            # Add bias tracks
            if len(self.ds.bias_specs) > 0:

                biases = {
                    bias_task:
                    run_extractors(self.bias_bw_extractors[bias_task],
                                   [target_interval],
                                   ignore_strand=spec.ignore_strand)[0]
                    for bias_task, spec in self.ds.bias_specs.items()
                }

                task_biases = {
                    f"bias/{task}/profile": np.concatenate(
                        [biases[bt] for bt in self.task_bias_tracks[task]],
                        axis=-1)
                    for task in self.tasks
                }

                if self.target_transformer is not None:
                    for task in self.tasks:
                        task_biases[f'bias/{task}/counts'] = np.log(
                            1 + task_biases[f'bias/{task}/profile'].sum(0))
                    # total_count_bias = np.concatenate([np.log(1 + x[k].sum(0))
                    #                                    for k, x in biases.items()], axis=-1)
                    # task_biases['bias/total_counts'] = total_count_bias

                if self.profile_bias_pool_size is not None:
                    for task in self.tasks:
                        task_biases[f'bias/{task}/profile'] = np.concatenate(
                            [
                                moving_average(
                                    task_biases[f'bias/{task}/profile'],
                                    n=pool_size) for pool_size in to_list(
                                        self.profile_bias_pool_size)
                            ],
                            axis=-1)

                sequence = {"seq": sequence, **task_biases}
        else:
            cuts = dict()

        if self.include_classes:
            if self.taskname_first:
                # Get the classes from the tsv file
                classes = {
                    f"{task}/class": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            else:
                classes = {
                    f"class/{task}": self.dfm.iat[idx, i + 3]
                    for i, task in enumerate(self.dfm_tasks)
                    if task in self.tasks
                }
            cuts = {**cuts, **classes}

        out = {"inputs": sequence, "targets": cuts}

        if self.include_metadata:
            out['metadata'] = {
                "range":
                GenomicRanges(
                    chr=target_interval.chrom,
                    start=target_interval.start,
                    end=target_interval.stop,
                    id=idx,
                    strand=(target_interval.strand
                            if target_interval.strand is not None else "*"),
                ),
                "interval_from_task":
                interval_from_task
            }
        return out
Example #7
    def __init__(self,
                 ds,
                 peak_width=200,
                 seq_width=None,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 intervals_file=None,
                 bcolz=False,
                 in_memory=False,
                 include_metadata=True,
                 taskname_first=False,
                 tasks=None,
                 include_classes=False,
                 only_classes=False,
                 shuffle=True,
                 interval_transformer=None,
                 target_transformer=None,
                 profile_bias_pool_size=None):
        """Dataset for loading the bigwigs and fastas

        Args:
          ds (basepair.src.schemas.DataSpec): data specification containing the
            fasta file, bed files and bigWig file paths
          peak_width: resize the bed intervals to this width
          seq_width: width of the extracted sequence (defaults to peak_width)
          incl_chromosomes (list of str): chromosomes to include; all others
            are dropped
          excl_chromosomes (list of str): chromosomes to exclude
          intervals_file: if specified, use these regions to train the model.
            If not specified, the regions are inferred from the dataspec.
          only_classes: if True, load only the classes
          bcolz: if True, the bigwig/fasta files are in the genomelake bcolz format
          in_memory: if True, load the whole bcolz array into memory. Only
            applicable when bcolz=True
          shuffle: if True, shuffle the intervals
          target_transformer: trained transformer object providing a .transform
            method applied to the targets
        """
        if isinstance(ds, str):
            self.ds = DataSpec.load(ds)
        else:
            self.ds = ds
        self.peak_width = peak_width
        if seq_width is None:
            self.seq_width = peak_width
        else:
            self.seq_width = seq_width
        self.shuffle = shuffle
        self.intervals_file = intervals_file
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.target_transformer = target_transformer
        self.include_classes = include_classes
        self.only_classes = only_classes
        self.taskname_first = taskname_first
        if self.only_classes:
            assert self.include_classes
        self.profile_bias_pool_size = profile_bias_pool_size
        # not specified yet
        self.fasta_extractor = None
        self.bw_extractors = None
        self.bias_bw_extractors = None
        self.include_metadata = include_metadata
        self.interval_transformer = interval_transformer
        self.bcolz = bcolz
        self.in_memory = in_memory
        if not self.bcolz and self.in_memory:
            raise ValueError(
                "in_memory option only applicable when bcolz=True")

        # Load chromosome lengths
        if self.bcolz:
            p = json.loads(
                (Path(self.ds.fasta_file) / "metadata.json").read_text())
            self.chrom_lens = {c: v[0] for c, v in p['file_shapes'].items()}
        else:
            fa = FastaFile(self.ds.fasta_file)
            self.chrom_lens = {
                name: l
                for name, l in zip(fa.references, fa.lengths)
            }
            if len(self.chrom_lens) == 0:
                raise ValueError(
                    f"no chromosomes found in fasta file: {self.ds.fasta_file}. "
                    f"Make sure the file path is correct and that the fasta "
                    f"index file {self.ds.fasta_file}.fai is up to date")
            del fa

        if self.intervals_file is None:
            self.dfm = load_beds(
                bed_files={
                    task: task_spec.peaks
                    for task, task_spec in self.ds.task_specs.items()
                    if task_spec.peaks is not None
                },
                chromosome_lens=self.chrom_lens,
                excl_chromosomes=self.excl_chromosomes,
                incl_chromosomes=self.incl_chromosomes,
                resize_width=max(self.peak_width, self.seq_width))
            assert list(
                self.dfm.columns)[:4] == ["chrom", "start", "end", "task"]
            if self.shuffle:
                self.dfm = self.dfm.sample(frac=1)
            self.tsv = None
            self.dfm_tasks = None
        else:
            self.tsv = TsvReader(self.intervals_file,
                                 num_chr=False,
                                 label_dtype=int,
                                 mask_ambigous=-1,
                                 incl_chromosomes=incl_chromosomes,
                                 excl_chromosomes=excl_chromosomes,
                                 chromosome_lens=self.chrom_lens,
                                 resize_width=max(self.peak_width,
                                                  self.seq_width))
            if self.shuffle:
                self.tsv.shuffle_inplace()
            self.dfm = self.tsv.df  # use the data-frame from tsv
            self.dfm_tasks = self.tsv.get_target_names()

        # remember the tasks
        if tasks is None:
            self.tasks = list(self.ds.task_specs)
        else:
            self.tasks = tasks

        if self.bcolz and self.in_memory:
            self.fasta_extractor = ArrayExtractor(self.ds.fasta_file,
                                                  in_memory=True)
            self.bw_extractors = {
                task: [
                    ArrayExtractor(task_spec.pos_counts, in_memory=True),
                    ArrayExtractor(task_spec.neg_counts, in_memory=True)
                ]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }
            self.bias_bw_extractors = {
                task: [
                    ArrayExtractor(task_spec.pos_counts, in_memory=True),
                    ArrayExtractor(task_spec.neg_counts, in_memory=True)
                ]
                for task, task_spec in self.ds.bias_specs.items()
                if task in self.tasks
            }

        if self.include_classes:
            assert self.dfm_tasks is not None

        if self.dfm_tasks is not None:
            assert set(self.tasks).issubset(self.dfm_tasks)

        # setup bias maps per task
        self.task_bias_tracks = {
            task: [
                bias for bias, spec in self.ds.bias_specs.items()
                if task in spec.tasks
            ]
            for task in self.tasks
        }
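A usage sketch, with caveats: the enclosing class is not named in this excerpt, so `Dataset` below is a stand-in, and `dataspec.yml` is a placeholder path.

# Hypothetical usage -- 'Dataset' stands in for the class these methods belong to,
# and 'dataspec.yml' is a placeholder dataspec path.
data = Dataset('dataspec.yml',
               peak_width=200,
               incl_chromosomes=['chr1', 'chr2'],
               shuffle=True)
sample = data[0]                   # __getitem__ shown in Example #6
print(sample['targets'].keys())    # e.g. 'profile/<task>' keys
print(sample['metadata']['range'])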
Example #8
def extractor(intervals_file,
              input_data_sources,
              target_data_sources=None,
              batch_size=128):
    """BatchGenerator

    Args:
        intervals_file: tsv file
            Assumes bed-like `chrom start end id` format.
        input_data_sources: dict
            mapping from input name to genomelake directory
        target_data_sources: dict, optional
            mapping from input name to genomelake directory
        batch_size: int
    """
    if not isinstance(input_data_sources, dict):
        import zipfile
        if zipfile.is_zipfile(input_data_sources):
            input_data_sources = inflate_data_sources(input_data_sources)
        else:
            raise Exception(
                "input_data_sources has to be a Python dict or the path to a "
                "zipped directory!")
    bt = pybedtools.BedTool(intervals_file)
    input_data_extractors = {
        key: ArrayExtractor(data_source)
        for key, data_source in input_data_sources.items()
    }
    if target_data_sources is not None:
        target_data_extractors = {
            key: ArrayExtractor(data_source)
            for key, data_source in target_data_sources.items()
        }
    intervals_generator = batch_iter(bt, batch_size)
    for intervals_batch in intervals_generator:
        out = {}
        # get data
        out['inputs'] = {
            key: extractor(intervals_batch)[..., None]  # add channel axis for conv1d
            for key, extractor in input_data_extractors.items()
        }
        if target_data_sources is not None:
            out['targets'] = {
                key: extractor(intervals_batch)[..., None]  # add channel axis for conv1d
                for key, extractor in target_data_extractors.items()
            }
        # get metadata
        chrom = []
        start = []
        end = []
        ids = []
        for interval in intervals_batch:
            chrom.append(interval.chrom)
            start.append(interval.start)
            end.append(interval.stop)
            ids.append(interval.name)

        out['metadata'] = {
            'ranges': GenomicRanges(chr=np.array(chrom),
                                    start=np.array(start),
                                    end=np.array(end),
                                    id=np.array(ids))
        }

        yield out
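Unlike Example #1, this variant also accepts a zip archive of the genomelake directories; a hedged sketch (the tsv path and archive name are placeholders):

# Sketch: a zipped directory may be passed instead of a dict; it is inflated
# internally via inflate_data_sources.
batches = extractor('intervals.tsv', 'input_sources.zip', batch_size=32)
batch = next(batches)
print(batch['metadata']['ranges'])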