Example 1
def convert_region(region, pflank):
    # Center a fixed window of 2 * pflank + 1 bases on the midpoint of the region.
    mid = (region.end + region.start) // 2
    return Interval(region.chrom, mid - pflank, mid + pflank + 1, region.name,
                    region.score, region.strand)
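
A minimal usage sketch (the input peak is made up; assumes pybedtools is installed): convert_region re-centers a region onto a fixed window of 2 * pflank + 1 bases around its midpoint.

from pybedtools import Interval

peak = Interval('chr1', 100, 200, 'peak1', '5', '+')
centered = convert_region(peak, pflank=50)
print(centered.start, centered.end)  # 100 201: a 101-bp window centered on base 150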
Example 2
                    default=0,
                    type=int,
                    help="Score threshold for the binding peaks")
args = parser.parse_args()

trackopts = "track name=\"%s\" description=\"%s\" visibility=1 color=0,60,120 useScore=1" % (
    args.name, args.description)
print(trackopts)

regions = BedTool(args.path)
regions = BedTool([x for x in regions if float(x.score) > args.minscore])

# Map peak scores onto UCSC useScore shading bins via score percentiles.
score_range = [166, 277, 388, 499, 611, 722, 833, 945, 1000]
percentile_range = list(np.linspace(0, 100, len(score_range) + 1))
scores = [float(x.score) for x in regions]
thresholds = np.percentile(scores, percentile_range)
thresholds[-1] *= 1.01  # inflate the top threshold so the maximum score falls in the last bin

updated_regions = []
for region in regions:
    for ucsc_score, t1, t2 in zip(score_range, thresholds, thresholds[1:]):
        if (t1 <= float(region.score) < t2):
            updated_regions.append(
                Interval('chr1', region.start, region.end, region.name,
                         str(ucsc_score - 2), region.strand))

#print([float(x.score) for x in updated_regions])

for interval in updated_regions:
    sys.stdout.write(str(interval))
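
The percentile binning above can be illustrated in isolation; the toy scores below are made up.

import numpy as np

score_range = [166, 277, 388, 499, 611, 722, 833, 945, 1000]
scores = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
thresholds = np.percentile(scores, np.linspace(0, 100, len(score_range) + 1))
thresholds[-1] *= 1.01  # make the top bin inclusive of the maximum score
for ucsc_score, t1, t2 in zip(score_range, thresholds, thresholds[1:]):
    print("[%.2f, %.2f) -> %d" % (t1, t2, ucsc_score))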
Example 3
             ylabel,
             va='center',
             rotation='vertical',
             fontsize='xx-large')
    plt.savefig(path, format=args.format)
    #plt.show()
    plt.close()


##########################################################################################################################

#regions = BedTool([Interval(x.chrom, max(0, x.start - args.flen), x.end + args.flen, name=x.name, score=x.attrs['zscores'] , strand=x.strand) for x in regions])
regions = BedTool([
    Interval(x.chrom,
             max(0, x.start - args.flen),
             x.end + args.flen,
             name=x.name,
             score=x.attrs['zscores'],
             strand=x.strand) for x in regions
])

annotation = BedTool(args.annotation)
if (not args.custom):
    annotation = [x for x in annotation if x[2] in ['gene', 'pseudogene']]

rawannotated = regions.intersect(annotation, wo=True)
annotated = defaultdict(list)


def get_annotation(intersection, offset):
    return max(intersection.start, int(intersection[offset + 3])), min(
        intersection.end, int(intersection[offset + 4])), dict([
Example 4
            current_drop.append(int(a[1]))
            gcdrops.append(tuple(current_drop))
            current_drop = []
        #if(a[3] == 'min'):
        #minima[int(a[1])] = float(a[2]);
        #else:
        #maxima[int(a[1])] = float(a[2]);

gcdrops_intervals = []
for c, (start, end) in enumerate(gcdrops, start=1):
    score = min(gc_content[start:end])
    if (score <= 0.35):
        gcdrops_intervals.append(
            Interval('chr1',
                     start,
                     end,
                     name="drop%d" % c,
                     score="%1.5f" % score,
                     strand='+'))

gcdrops_intervals = BedTool(gcdrops_intervals)
print(len(gcdrops_intervals))

gcdrops_regions = gcdrops_intervals.intersect(regions, c=True)
#for interval in gcdrops_regions:
#print(interval)

#gcmeans = [np.mean(x) for x in sliding_window(gc_content, 20)]
#print(heapq.nsmallest(20, gcmeans))

#print(heapq.nsmallest(20, minima))
Example 5
def consensus(
    bed: Iterable[BedTool],
    weights: List[int],
    threshold: float,
    merge: Callable[[Iterable[Interval], Interval], Interval] = _default_merge
) -> BedTool:
    """
    :param bed: original BED files
    :param weights: vote weights; pass all ones if you are not sure
    :param threshold: minimum total vote weight required to consider a region conserved
    :param merge: function to transfer meta information from the voted intervals to the new interval
    :return: merged BedTool of consensus regions
    """
    # 1. Create intervals that have the same number of full hits with respect to the original bed files.
    # 2. Threshold regions by the number of hits
    # |▓▓▓| |▓▓▓▓▓▓▓▓▓|
    # |▓▓|    |▓▓▓▓▓▓▓▓▓|
    # |▓|         |▓▓▓▓▓▓▓▓▓|
    # -----------------------
    # |▓|▓| |▓▓|▓▓|▓▓▓|▓▓|▓▓|
    # |3|2| |1 |2 |3  |2 |1 |
    result = []

    # In parallel loop over bed intervals
    # At each step select subinterval and count hits

    iterators = [iter(b.sort()) for b in bed]
    assert len(weights) == len(iterators), "len(bed) != len(weights)"
    intervals = [(ind, next(b_iter)) for ind, b_iter in enumerate(iterators)]

    while intervals:
        boundaries = sorted(
            set([(inter.chrom, inter.start) for _, inter in intervals] +
                [(inter.chrom, inter.end)
                 for _, inter in intervals]))  # [(chrom, boundary), ...]
        schrom, start = boundaries[0]
        echrom, end = boundaries[1]
        assert schrom == echrom

        hits = []
        for ind, inter in intervals:
            if inter.chrom == schrom and inter.start <= start and end <= inter.end:
                hits.append(weights[ind])
        assert len(hits) > 0
        if sum(hits) >= threshold:
            consolidated = Interval(schrom, start, end)
            consolidated = merge(hits, consolidated)
            result.append(consolidated)

        # Push interval forward
        new_intervals = []
        for ind, inter in intervals:
            if inter.chrom != schrom:
                new_intervals.append((ind, inter))
                continue
            assert inter.start >= start

            if start == inter.start:
                assert end <= inter.end
                inter.start = end

            # request next interval
            if inter.start == inter.end:
                try:
                    inter = next(iterators[ind])
                    new_intervals.append((ind, inter))
                except StopIteration:
                    continue
            else:
                new_intervals.append((ind, inter))
        intervals = new_intervals
    return BedTool(result).sort().merge()
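
A hedged usage sketch for consensus(): the three inputs are toy BedTools, and _default_merge is assumed to be defined in the surrounding module.

from pybedtools import BedTool

a = BedTool("chr1\t100\t200\nchr1\t500\t600\n", from_string=True)
b = BedTool("chr1\t150\t250\n", from_string=True)
c = BedTool("chr1\t120\t180\n", from_string=True)
# keep subregions supported by at least two of the three inputs (equal weights)
conserved = consensus([a, b, c], weights=[1, 1, 1], threshold=2)
print(conserved)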
Example 6
def get_flanks(include_file, seeds, up_distance, down_distance):

    # .intervals returns the tree
    include = include_file.intervals

    def bad_cov(intervals):
        coverage = set()
        for i in intervals:
            coverage.update(range(i.start, i.end))
        return len(coverage)

    fmt = "{s.chrom}\t{s.start}\t{s.end}\t{t}\t{left}\t{right}\t{n_intervals}\n"
    for seed in seeds:
        seed_hits = include.all_hits(seed)

        # and either continue or yield None?
        if not seed_hits:
            continue
        seed_hits = sorted(seed_hits, key=attrgetter('start'))

        if seed.strand == "-":
            region_left = Interval(seed.chrom,
                                   max(0, seed.start - down_distance),
                                   seed.start)
            region_right = Interval(seed.chrom, seed.end,
                                    seed.end + up_distance)
        else:
            region_left = Interval(seed.chrom, max(0,
                                                   seed.start - up_distance),
                                   seed.start)
            region_right = Interval(seed.chrom, seed.end,
                                    seed.end + down_distance)
        # this is needed, unfortunately
        region_right.file_type = region_left.file_type = "bed"

        # this won't handle overlapping include intervals...
        include_left = sorted(include.all_hits(region_left),
                              key=attrgetter('start'))
        include_right = sorted(include.all_hits(region_right),
                               key=attrgetter('start'))

        #assert bad_cov(include_right) == sum(i.length for i in include_right)
        #assert bad_cov(include_left) == sum(i.length for i in include_left)

        if include_left:
            # truncate to not include the seed point so we get unique for left, right
            # and add in the seed at the end
            include_left[-1].end = min(include_left[-1].end, seed.start)
            # adjust the start point so we get exactly the requested pad
            include_left[0].start = max(include_left[0].start,
                                        region_left.start)

        if include_right:
            # truncate to region
            include_right[0].start = max(include_right[0].start, seed.end)
            include_right[-1].end = min(include_right[-1].end,
                                        region_right.end)

        #assert bad_cov(include_right) == sum(i.length for i in include_right)
        #assert bad_cov(include_left) == sum(i.length for i in include_left)

        assert include_left == [] or include_left[-1].end <= seed.start
        assert include_right == [] or seed.end <= include_right[0].start

        #null_bases = total_bases - l

        #assert bad_cov(include_left) == sum(i.length for i in include_left)
        #assert bad_cov(include_right) == sum(i.length for i in include_right)
        #assert bad_cov(seed_hits) == sum(i.length for i in seed_hits)

        # truncate seed_hits to actual seed region
        seed_hits[0].start = max(seed_hits[0].start, seed.start)
        seed_hits[-1].end = min(seed_hits[-1].end, seed.end)

        #assert bad_cov(seed_hits) == sum(i.length for i in seed_hits)

        flanks = include_left + include_right
        total_bases = sum(i.length for i in flanks) + sum(i.length
                                                          for i in seed_hits)
        #coverage = bad_cov(flanks)
        #assert coverage == total_bases
        yield flanks, seed_hits, total_bases, seed
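
A hypothetical driver for get_flanks(); the BED file names are placeholders.

from pybedtools import BedTool

include = BedTool("include_regions.bed")  # regions allowed to contribute flank bases
seeds = BedTool("seeds.bed")              # seed intervals to pad
for flanks, seed_hits, total_bases, seed in get_flanks(include, seeds,
                                                       up_distance=1000,
                                                       down_distance=500):
    print(seed.chrom, seed.start, seed.end, total_bases, len(flanks))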
Example 7
    def testRichCmp(self):

        # be obsessive . . .
        #
        # ==
        a = Interval("chr21", 100, 200)
        b = Interval("chr21", 100, 200)
        self.assertTrue(a == b)
        self.assertFalse(a != b)
        self.assertTrue(a <= b)
        self.assertTrue(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        a = Interval("chr21", 100, 100)
        b = Interval("chr21", 100, 100)
        self.assertTrue(a == b)
        self.assertFalse(a != b)
        self.assertTrue(a <= b)
        self.assertTrue(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # != because of strand
        a = Interval("chr21", 100, 200, strand='+')
        b = Interval("chr21", 100, 200, strand='-')
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertFalse(a <= b)
        self.assertFalse(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # a >= b
        a = Interval("chr21", 100, 300)
        b = Interval("chr21", 100, 200)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertFalse(a <= b)
        self.assertTrue(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # a <= b
        a = Interval("chr21", 100, 300)
        b = Interval("chr21", 300, 300)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertTrue(a <= b)
        self.assertFalse(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # a <= b
        a = Interval("chr21", 100, 300)
        b = Interval("chr21", 250, 300)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertTrue(a <= b)
        self.assertFalse(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # a < b
        a = Interval("chr21", 100, 200)
        b = Interval("chr21", 201, 300)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertTrue(a <= b)
        self.assertFalse(a >= b)
        self.assertTrue(a < b)
        self.assertFalse(a > b)

        # a > b
        a = Interval("chr21", 201, 300)
        b = Interval("chr21", 100, 200)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertFalse(a <= b)
        self.assertTrue(a >= b)
        self.assertFalse(a < b)
        self.assertTrue(a > b)

        # a != b
        a = Interval("none", 1, 100)
        b = Interval("chr21", 1, 100)
        self.assertFalse(a == b)
        self.assertTrue(a != b)
        self.assertFalse(a <= b)
        self.assertFalse(a >= b)
        self.assertFalse(a < b)
        self.assertFalse(a > b)

        # nested should raise NotImplementedError
        a = Interval("chr21", 100, 200)
        b = Interval("chr21", 50, 300)
        self.assertRaises(NotImplementedError, a.__eq__, b)
        self.assertRaises(NotImplementedError, a.__ne__, b)
        self.assertRaises(NotImplementedError, a.__le__, b)
        self.assertRaises(NotImplementedError, a.__ge__, b)
        self.assertRaises(NotImplementedError, a.__lt__, b)
        self.assertRaises(NotImplementedError, a.__gt__, b)
Example 8
    def __getitem__(self, idx):
        from pybedtools import Interval

        if self.fasta_extractor is None:
            # first call
            # Use normal fasta/bigwig extractors
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file,
                                                  use_strand=True)

            self.bw_extractors = {
                task: [BigwigExtractor(track) for track in task_spec.tracks]
                for task, task_spec in self.ds.task_specs.items()
                if task in self.tasks
            }

            self.bias_bw_extractors = {
                task: [BigwigExtractor(track) for track in task_spec.tracks]
                for task, task_spec in self.ds.bias_specs.items()
            }

        # Get the genomic interval for that particular datapoint
        interval = Interval(
            self.dfm.iat[idx, 0],  # chrom
            self.dfm.iat[idx, 1],  # start
            self.dfm.iat[idx, 2])  # end

        # Transform the input interval (for say augmentation...)
        if self.interval_transformer is not None:
            interval = self.interval_transformer(interval)

        # resize the intervals to the desired widths
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)

        # This only kicks in when we specify the taskname from dataspec
        # to the 3rd column. E.g. it doesn't apply when using intervals_file
        interval_from_task = self.dfm.iat[
            idx, 3] if self.intervals_file is None else ''

        # extract DNA sequence + one-hot encode it
        sequence = self.fasta_extractor([seq_interval])[0]
        inputs = {"seq": sequence}

        # extract the profile counts from the bigwigs
        cuts = {
            f"{task}/profile": _run_extractors(self.bw_extractors[task],
                                               [target_interval],
                                               sum_tracks=spec.sum_tracks)[0]
            for task, spec in self.ds.task_specs.items() if task in self.tasks
        }
        if self.track_transform is not None:
            for task in self.tasks:
                cuts[f'{task}/profile'] = self.track_transform(
                    cuts[f'{task}/profile'])

        # Add the per-task binary activity labels
        for i, task in enumerate(self.tasks):
            #print("active", self.dfm.iat[idx, (3+i)])

            cuts[f'{task}/activity'] = self.dfm.iat[idx, (4 + i)]

        # Add total number of counts
        for task in self.tasks:
            cuts[f'{task}/counts'] = self.total_count_transform(
                cuts[f'{task}/profile'].sum(0))

        if len(self.ds.bias_specs) > 0:
            # Extract the bias tracks
            biases = {
                bias_task: _run_extractors(self.bias_bw_extractors[bias_task],
                                           [target_interval],
                                           sum_tracks=spec.sum_tracks)[0]
                for bias_task, spec in self.ds.bias_specs.items()
            }

            task_biases = {
                f"bias/{task}/profile": np.concatenate(
                    [biases[bt] for bt in self.task_bias_tracks[task]],
                    axis=-1)
                for task in self.tasks
            }

            if self.track_transform is not None:
                for task in self.tasks:
                    task_biases[f'bias/{task}/profile'] = self.track_transform(
                        task_biases[f'bias/{task}/profile'])

            # Add total number of bias counts
            for task in self.tasks:
                task_biases[
                    f'bias/{task}/counts'] = self.total_count_transform(
                        task_biases[f'bias/{task}/profile'].sum(0))

            inputs = {**inputs, **task_biases}

        if self.include_classes:
            # Optionally, add binary labels from the additional columns in the tsv intervals file
            classes = {
                f"{task}/class": self.dfm.iat[idx, i + 3]
                for i, task in enumerate(self.dfm_tasks) if task in self.tasks
            }
            cuts = {**cuts, **classes}

        out = {"inputs": inputs, "targets": cuts}

        if self.include_metadata:
            # remember the metadata (what genomic interval was used)
            out['metadata'] = {
                "range":
                GenomicRanges(
                    chr=target_interval.chrom,
                    start=target_interval.start,
                    end=target_interval.stop,
                    id=idx,
                    strand=(target_interval.strand
                            if target_interval.strand is not None else "*"),
                ),
                "interval_from_task":
                interval_from_task
            }
        return out
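
Hypothetical usage, assuming ds is an instance of this dataset class and "mytask" is one of its configured tasks:

out = ds[0]
seq = out["inputs"]["seq"]                  # one-hot encoded DNA sequence
profile = out["targets"]["mytask/profile"]  # per-base profile counts
counts = out["targets"]["mytask/counts"]    # transformed total counts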
Example 9
    def testOverlaps(self):
        i = Interval("chr21", 9719768, 9739768)
        hits = self.bed.all_hits(i)
        self.assertEqual(len(hits), 8)
        for hit in hits:
            self.assertTrue(hit.start <= 9739768 and hit.end >= 9719768)
Example 10
    def setUp(self):
        self.file = os.path.join(PATH, self.file)
        start, end, strand = 1, 100, "+"
        self.i = Interval("chr1", start, end, strand=strand)
        self.start, self.end, self.strand = start, end, strand
Example 11
    def _fetch(self, interval, istart, iend):
        seq = self.fasta.extract(Interval(interval.chrom, istart, iend))
        seq = Sequence(name=interval.chrom, seq=seq, start=istart, end=iend)
        return seq
Example 12
    def __init__(self, data, reverse_complement_bool=False, contig=None, strand=None):
        """
        Constructor

        :param data: genome of a being to be researched
        :type data: GFGenome or pyensembl.Genome object
        :param reverse_complement_bool: True if the reverse_complement of 5'UTR sequence for "-" strand is required
        :type reverse_complement_bool: bool
        :param contig: optional, number of the chromosome without 'chr'
        :type contig: str
        :param strand: optional, chromosome strand
        :type strand: char '+' or '-'
        :return: initializes the following attributes:

            - seq[] – list of 5' UTR sequences with exons and introns

            - seq_exons[] – list of 5' UTR sequences with only exons,
                NOTE it gives exons corresponding to the 5' UTR, therefore the last corresponding exon gets
                cropped at the start_codon_positions[0]

            - intervals[] – list of 5' UTR intervals

            - transcripts{} – dictionary: key – transcript id, value – index of the corresponding 5' UTR

            - exons{} – dictionary: key - 5' UTR index, value - list of tuples (exon sequence, exon interval)

        """

        # NOTE: two 5' UTRs are considered equal only if both their seq and seq_exons are equal

        self.seq_exons = []
        self.intervals = []
        self.transcripts = {}
        self.exons = {}

        count = 0

        if strand is None:
            for transcript in data.transcripts(contig, '+'):
                if transcript.contains_start_codon:

                    temp_exon_list = []
                    start = transcript.start
                    start_pos = 0
                    for exon in transcript.exons:
                        if transcript.start_codon_positions[0] >= exon.start >= start:
                            if exon.end > transcript.start_codon_positions[0]:
                                temp_exon_list.append((transcript.five_prime_utr_sequence[start_pos:
                                                                                          (start_pos +
                                                                                           transcript.start_codon_positions[
                                                                                               0] - exon.start)],
                                                       Interval(transcript.contig, exon.start,
                                                                transcript.start_codon_positions[0] - 1,
                                                                exon.id, 0, "+")))  # dummy score
                            else:
                                temp_exon_list.append((transcript.five_prime_utr_sequence[start_pos:
                                                                                          (
                                                                                              start_pos + exon.end - exon.start + 1)],
                                                       Interval(transcript.contig, exon.start, exon.end, exon.id, 0,
                                                                "+")))  # dummy score
                                start_pos = start_pos + exon.end - exon.start + 1
                            start = exon.start

                    # apparently two 5' UTRs can have the same exonic sequence but different exonic+intronic sequences
                    if transcript.five_prime_utr_sequence not in self.seq_exons:
                        self.seq_exons.append(transcript.five_prime_utr_sequence)
                        self.intervals.append(
                            Interval(transcript.contig, transcript.exons[0].start,
                                     transcript.exons[len(transcript.exons) - 1].end,
                                     "5' UTR", 0, "+"))  # dummy score
                        self.exons[count] = temp_exon_list
                        self.transcripts[transcript.id] = count
                        count = count + 1
                    else:
                        pos = self.seq_exons.index(transcript.five_prime_utr_sequence)
                        if (self.intervals[pos].strand == "+") and \
                                ((self.intervals[pos]).start != (transcript.exons[0]).start):
                            self.seq_exons.append(transcript.five_prime_utr_sequence)
                            self.intervals.append(
                                Interval(transcript.contig, transcript.exons[0].start,
                                         transcript.exons[len(transcript.exons) - 1].end,
                                         "5' UTR", 0, "+"))  # dummy score
                            self.exons[count] = temp_exon_list
                            self.transcripts[transcript.id] = count
                            count = count + 1
                        else:
                            self.transcripts[transcript.id] = self.seq_exons.index(
                                transcript.five_prime_utr_sequence)

            for transcript in data.transcripts(contig, '-'):
                if transcript.contains_start_codon:

                    temp_exon_list = []
                    end = transcript.end
                    temp_reverse_seq = reverse_complement(transcript.five_prime_utr_sequence)
                    start_pos = len(temp_reverse_seq)
                    for exon in transcript.exons:
                        if transcript.start_codon_positions[2] <= exon.end <= end:
                            if exon.start < transcript.start_codon_positions[2]:
                                temp_exon_list.append((temp_reverse_seq[:start_pos],
                                                       Interval(transcript.contig,
                                                                transcript.start_codon_positions[2] + 1, exon.end,
                                                                exon.id, 0,
                                                                "-")))  # dummy score
                            else:
                                temp_exon_list.append(
                                    (temp_reverse_seq[start_pos - (exon.end - exon.start + 1):start_pos],
                                     Interval(transcript.contig, exon.start, exon.end, exon.id, 0,
                                              "-")))  # dummy score
                                start_pos = start_pos - (exon.end - exon.start) - 1
                            end = exon.end

                    if reverse_complement_bool:
                        temp_exon_list_reverse = []
                        for temp_exon in temp_exon_list:
                            temp_exon_list_reverse.append((reverse_complement(temp_exon[0]), temp_exon[1]))
                        temp_exon_list = temp_exon_list_reverse
                        current_transcript_seq = transcript.five_prime_utr_sequence
                    else:
                        current_transcript_seq = reverse_complement(transcript.five_prime_utr_sequence)

                    if current_transcript_seq not in self.seq_exons:
                        self.seq_exons.append(current_transcript_seq)
                        self.intervals.append(Interval(transcript.contig, transcript.exons[0].start,
                                                       transcript.exons[len(transcript.exons) - 1].end, "5' UTR", 0,
                                                       "-"))  # dummy score
                        self.exons[count] = temp_exon_list
                        self.transcripts[transcript.id] = count
                        count = count + 1
                    else:
                        pos = self.seq_exons.index(current_transcript_seq)
                        if (self.intervals[pos].strand == "-") and \
                            (self.intervals[pos].start != transcript.exons[len(transcript.exons) - 1].end):
                            self.seq_exons.append(current_transcript_seq)
                            self.intervals.append(
                                Interval(transcript.contig, transcript.exons[0].start,
                                         transcript.exons[len(transcript.exons) - 1].end, "5' UTR", 0,
                                         "-"))  # dummy score
                            self.exons[count] = temp_exon_list
                            self.transcripts[transcript.id] = count
                            count = count + 1
                        else:
                            self.transcripts[transcript.id] = self.seq_exons.index(current_transcript_seq)
        else:
            if strand == "+":
                for transcript in data.transcripts(contig, '+'):
                    if transcript.contains_start_codon:

                        temp_exon_list = []
                        start = transcript.start
                        start_pos = 0
                        for exon in transcript.exons:
                            if transcript.start_codon_positions[0] >= exon.start >= start:
                                if exon.end > transcript.start_codon_positions[0]:
                                    temp_exon_list.append((transcript.five_prime_utr_sequence[start_pos:
                                                                                              (start_pos +
                                                                                               transcript.start_codon_positions[
                                                                                                   0] - exon.start)],
                                                           Interval(transcript.contig, exon.start,
                                                                    transcript.start_codon_positions[0] - 1,
                                                                    exon.id, 0, "+")))  # dummy score
                                else:
                                    temp_exon_list.append((transcript.five_prime_utr_sequence[start_pos:
                                                                                              (
                                                                                                  start_pos + exon.end - exon.start + 1)],
                                                           Interval(transcript.contig, exon.start, exon.end, exon.id, 0,
                                                                    "+")))  # dummy score
                                    start_pos = start_pos + exon.end - exon.start + 1
                                start = exon.start

                        # apparently two 5' UTRs can have the same exonic sequence but different exonic+intronic sequences
                        if transcript.five_prime_utr_sequence not in self.seq_exons:
                            self.seq_exons.append(transcript.five_prime_utr_sequence)
                            self.intervals.append(
                                Interval(transcript.contig, transcript.exons[0].start,
                                         transcript.exons[len(transcript.exons) - 1].end,
                                         "5' UTR", 0, "+"))  # dummy score
                            self.exons[count] = temp_exon_list
                            self.transcripts[transcript.id] = count
                            count = count + 1
                        else:
                            pos = self.seq_exons.index(transcript.five_prime_utr_sequence)
                            if (self.intervals[pos].strand == "+") and \
                                ((self.intervals[pos]).start != (transcript.exons[0]).start):
                                self.seq_exons.append(transcript.five_prime_utr_sequence)
                                self.intervals.append(
                                    Interval(transcript.contig, transcript.exons[0].start,
                                             transcript.exons[len(transcript.exons) - 1].end,
                                             "5' UTR", 0, "+"))  # dummy score
                                self.exons[count] = temp_exon_list
                                self.transcripts[transcript.id] = count
                                count = count + 1
                            else:
                                self.transcripts[transcript.id] = self.seq_exons.index(
                                    transcript.five_prime_utr_sequence)

            else:
                for transcript in data.transcripts(contig, '-'):
                    if transcript.contains_start_codon:

                        temp_exon_list = []
                        end = transcript.end
                        temp_reverse_seq = reverse_complement(transcript.five_prime_utr_sequence)
                        start_pos = len(temp_reverse_seq)
                        for exon in transcript.exons:
                            if transcript.start_codon_positions[2] <= exon.end <= end:
                                if exon.start < transcript.start_codon_positions[2]:
                                    temp_exon_list.append((temp_reverse_seq[:start_pos],
                                                           Interval(transcript.contig,
                                                                    transcript.start_codon_positions[2] + 1, exon.end,
                                                                    exon.id, 0,
                                                                    "-")))  # dummy score
                                else:
                                    temp_exon_list.append(
                                        (temp_reverse_seq[start_pos - (exon.end - exon.start + 1):start_pos],
                                         Interval(transcript.contig, exon.start, exon.end, exon.id, 0,
                                                  "-")))  # dummy score
                                    start_pos = start_pos - (exon.end - exon.start) - 1
                                end = exon.end

                        if reverse_complement_bool:
                            temp_exon_list_reverse = []
                            for temp_exon in temp_exon_list:
                                temp_exon_list_reverse.append((reverse_complement(temp_exon[0]), temp_exon[1]))
                            temp_exon_list = temp_exon_list_reverse
                            current_transcript_seq = transcript.five_prime_utr_sequence
                        else:
                            current_transcript_seq = reverse_complement(transcript.five_prime_utr_sequence)

                        if current_transcript_seq not in self.seq_exons:
                            self.seq_exons.append(current_transcript_seq)
                            self.intervals.append(Interval(transcript.contig, transcript.start_codon_positions[2] + 1,
                                                           transcript.end, "5' UTR", 0, "-"))  # dummy score
                            self.exons[count] = temp_exon_list
                            self.transcripts[transcript.id] = count
                            count = count + 1
                        else:
                            pos = self.seq_exons.index(current_transcript_seq)
                            if (self.intervals[pos].strand == "-") and \
                                (self.intervals[pos].start != transcript.exons[len(transcript.exons) - 1].end):
                                self.seq_exons.append(current_transcript_seq)
                                self.intervals.append(
                                    Interval(transcript.contig, transcript.start_codon_positions[2] + 1, transcript.end,
                                             "5' UTR", 0, "-"))  # dummy score
                                self.exons[count] = temp_exon_list
                                self.transcripts[transcript.id] = count
                                count = count + 1
                            else:
                                self.transcripts[transcript.id] = self.seq_exons.index(current_transcript_seq)
Example 13
    def interval(self):
        return Interval(self.chrom, self.start, self.end, self.out_fname)
Example 14
    def setUpClass(cls):
        cls.intervals = [
            Interval("ref1", 0, 5, "file1"),
            Interval("ref2", 10, 12, "file2"),
        ]
Example 15
    def get_nucleobase_mutation_table(self, vcf):
        """
        Get a table showing whether each nucleobase in the Kozak sequence or the stop codon context was mutated.

        :param vcf: path to the vcf.gz or file opened using cyvcf2
        :type vcf: string or an "opened" file
        :return: pd.DataFrame,

                column names – K_i, where i is the position in the Kozak sequence;
                S_i, where i is the position in the stop codon context; gene_id; transcript_id

                cell values – NaN – no variant, 1 – heterozygous variant, 2 – homozygous variant
        """

        # only for Kozak sequence and stop codon context + transcript_id column
        columns = [
            "K_0", "K_1", "K_2", "K_3", "K_4", "K_5", "K_6", "K_7", "K_8",
            "K_9", "K_10", "K_11", "K_12", "K_13", "K_14", "S_0", "S_1", "S_2",
            "S_3", "S_4", "S_5", "S_6", "S_7", "S_8", "S_9", "S_10", "S_11",
            "S_12", "S_13", "S_14", "transcript_id", "gene_id", "name"
        ]
        df_nucleobases = pd.DataFrame(columns=columns)
        nucleobases_lines = []

        mutator = VCFMutator(False, True, vcf, True)

        contigs = [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',
            'Y'
        ]

        for contig in contigs:
            for transcript in self.transcripts(contig, '+'):
                if transcript.contains_stop_codon and transcript.contains_start_codon:
                    Kozak_seq = transcript.get_Kozak_seq()
                    Interval_Kozak = Interval(
                        "chr" + transcript.contig,
                        transcript.start_codon_positions[0] - 6,
                        transcript.start_codon_positions[0] + 9, "NA", 0, "+")
                    stop_codon_context = transcript.get_stop_codon_context()
                    Interval_stop = Interval(
                        "chr" + transcript.contig,
                        transcript.stop_codon_positions[0] - 6,
                        transcript.stop_codon_positions[0] + 9, "NA", 0, "+")
                    df_nucleobases_line = mutator.mutate_codon_context(
                        [Interval_Kozak, Interval_stop],
                        [Kozak_seq, stop_codon_context], ["K_", "S_"])
                    if len(Kozak_seq) < 15:
                        new_columns = []
                        for column in df_nucleobases_line:
                            if column.find("K_") != -1:
                                new_columns.append("K_" + str(
                                    int(column[2:]) + (15 - len(Kozak_seq))))
                            else:
                                new_columns.append(column)
                        df_nucleobases_line.columns = new_columns

                    df_nucleobases_line["transcript_id"] = transcript.id
                    df_nucleobases_line["gene_id"] = transcript.gene_id
                    nucleobases_lines.append(df_nucleobases_line)
            for transcript in self.transcripts(contig, '-'):
                if transcript.contains_stop_codon and transcript.contains_start_codon:
                    Kozak_seq = reverse_complement(transcript.get_Kozak_seq())
                    Interval_Kozak = Interval(
                        "chr" + transcript.contig,
                        transcript.start_codon_positions[0] - 6,
                        transcript.start_codon_positions[0] + 9, "NA", 0, "-")
                    stop_codon_context = reverse_complement(
                        transcript.get_stop_codon_context())
                    Interval_stop = Interval(
                        "chr" + transcript.contig,
                        transcript.stop_codon_positions[0] - 6,
                        transcript.stop_codon_positions[0] + 9, "NA", 0, "-")
                    df_nucleobases_line = mutator.mutate_codon_context(
                        [Interval_Kozak, Interval_stop],
                        [Kozak_seq, stop_codon_context], ["K_", "S_"])
                    if len(Kozak_seq) < 15:
                        new_columns = []
                        for column in df_nucleobases_line:
                            if column.find("K_") != -1:
                                new_columns.append("K_" + str(
                                    int(column[2:]) + (15 - len(Kozak_seq))))
                            else:
                                new_columns.append(column)
                        df_nucleobases_line.columns = new_columns

                    df_nucleobases_line["transcript_id"] = transcript.id
                    df_nucleobases_line["gene_id"] = transcript.gene_id
                    nucleobases_lines.append(df_nucleobases_line)

        # concatenate once, after all contigs have been processed
        df_nucleobases = pd.concat(nucleobases_lines, ignore_index=True)
        df_nucleobases = df_nucleobases.drop(['name'], axis=1)
        return df_nucleobases
Example 16
def _draw_gene_annotation(fig, genes, chrom, start, end):
    wbed = BedTool([Interval(chrom, start, end)])
    regions = genes.intersect(wbed, wa=True, u=True)
    xs = []
    ys = []

    textxpos = []
    textypos = []
    names = []
    offset = 3.5 - 0.07
    prevends = 0

    rangeannot = []
    for i, region in enumerate(regions):
        names.append(region.name)

        #if region.start >= (prevends +10000):
        #    offset = 3.5-.07

        x, y = draw_gene(offset, region)
        xs += x
        ys += y
        textxpos.append(region.end + 500)
        textypos.append(offset)
        rangeannot.append(
            f"{region.chrom}:{region.start}-{region.end};{region.strand}")

        prevends = max(prevends, region.end)
        offset -= 0.2
        if offset <= 0.0:
            offset = 3.5 - 0.07

    if len(textxpos) > 0:
        plobjs = [
            go.Scatter(
                x=xs,
                y=ys,
                mode="lines",
                fill="toself",
                name="Genes",
                marker=dict(color="goldenrod"),
            ),
            go.Scatter(
                x=textxpos,
                y=textypos,
                text=names,
                mode="text",
                #opacity=0.0,
                name="Genes",
                customdata=rangeannot,
                hovertemplate="%{text}<br>%{customdata}",
                showlegend=False,
            ),
            #go.Scatter(
            #    x=xs,
            #    y=ys,
            #    mode="lines",
            #    #fill="toself",
            #    #name="Genes",
            #    #marker=dict(color="black"),
            #    line=dict(color='black', width=.1),
            #    showlegend=False,
            #),
        ]
        return plobjs
Example 17
def vcfrec2interval(record):
    """Given a VCF record object, return a pybedtools Interval object."""
    # NOTE: VCF positions are 1-based, while pybedtools Intervals are 0-based and half-open, so convert manually.
    return Interval(record.CHROM, record.POS - 1, record.POS)
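
A small illustrative check of the conversion from 1-based VCF coordinates to 0-based, half-open BED coordinates; the Record namedtuple stands in for a real VCF record object.

from collections import namedtuple

Record = namedtuple("Record", ["CHROM", "POS"])
rec = Record("chr1", 12345)        # VCF position 12345 (1-based)
iv = vcfrec2interval(rec)
print(iv.chrom, iv.start, iv.end)  # chr1 12344 12345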
Example 18
    def flow(self):
        """Data flow generator."""

        refs = np.zeros(
            (self.batch_size, self.binsize - self.bioseq.garray.order + 1, 1,
             pow(self.bioseq._alphabetsize, self.bioseq.garray.order)))
        alts = np.zeros_like(refs)

        vcf = VariantFile(self.variants).fetch()

        if self.annotation is not None:
            varbed = BedTool(self.variants)
            n_vcf_fields = len(varbed[0].fields)
            vcf_strand_augment = iter(
                varbed.intersect(self.annotation, loj=True))

        try:
            while True:
                # construct genomic region
                names = []
                chroms = []
                poss = []
                rallele = []
                aallele = []

                ibatch = 0

                while ibatch < self.batch_size:
                    rec = next(vcf)
                    rec_strandedness = '+'
                    if self.annotation is not None:
                        rec_aug = next(vcf_strand_augment)
                        rec_strandedness = '-' if '-' in rec_aug[
                            n_vcf_fields:] else '+'

                    if not self.is_compatible(rec):
                        continue

                    start, end = self.get_interval(rec)

                    names.append(rec.id if rec.id is not None else '')
                    chroms.append(rec.chrom)
                    poss.append(rec.pos - 1)
                    rallele.append(rec.ref.upper())
                    aallele.append(rec.alts[0].upper())

                    iref = self.bioseq._getsingleitem(
                        Interval(rec.chrom, start, end)).copy()
                    ialt = iref.copy()

                    for o in range(self.bioseq.garray.order):

                        irefbase = iref[self.binsize // 2 + o -
                                        self.bioseq.garray.order +
                                        (0 if self.binsize % 2 == 0 else 1)]
                        irefbase = irefbase // pow(self.bioseq._alphabetsize,
                                                   o)
                        irefbase = irefbase % self.bioseq._alphabetsize

                        if self.ignore_reference_match:
                            # process the variant even if
                            # it does not match with the reference base
                            replacement = (NMAP[rec.ref.upper()] -
                                           irefbase) * \
                                           pow(self.bioseq._alphabetsize, o)

                            iref[self.binsize // 2 + o -
                                 self.bioseq.garray.order +
                                 (0 if self.binsize %
                                  2 == 0 else 1)] += replacement

                            replacement = (NMAP[rec.alts[0].upper()] -
                                           irefbase) * \
                                          pow(self.bioseq._alphabetsize, o)

                            ialt[self.binsize // 2 + o -
                                 self.bioseq.garray.order +
                                 (0 if self.binsize %
                                  2 == 0 else 1)] += replacement
                            continue

                        if NMAP[rec.ref.upper()] != irefbase:
                            self.logger.info(
                                'VCF reference and reference genome not compatible. '
                                'Expected reference {}, but VCF indicates {}. '.
                                format(irefbase, NMAP[rec.ref.upper()]) +
                                'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                    rec.chrom, rec.pos, rec.ref, rec.alts[0],
                                    rec.id))
                        else:
                            # at this point, it is ensured that the VCF reference
                            # agrees with the reference genome.
                            replacement = (NMAP[rec.alts[0].upper()] -
                                           NMAP[rec.ref.upper()]) * \
                                          pow(self.bioseq._alphabetsize, o)

                            ialt[self.binsize // 2 + o -
                                 self.bioseq.garray.order +
                                 (0 if self.binsize %
                                  2 == 0 else 1)] += replacement

                    if rec_strandedness == '-':
                        ialt = self.bioseq._revcomp(ialt)
                        iref = self.bioseq._revcomp(iref)

                    alt = as_onehot(ialt[None, :], self.bioseq.garray.order,
                                    self.bioseq._alphabetsize)

                    alts[ibatch] = alt

                    ref = as_onehot(iref[None, :], self.bioseq.garray.order,
                                    self.bioseq._alphabetsize)
                    refs[ibatch] = ref

                    ibatch += 1
                yield names, chroms, poss, rallele, aallele, refs, alts

        except StopIteration:
            refs = refs[:ibatch]
            alts = alts[:ibatch]

            yield names, chroms, poss, rallele, aallele, refs, alts
Example 19
def test_logzscore_normalization(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[Interval('chr1', 0, 150), 0] = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = np.repeat(100,
                                                        300).reshape(-1, 1)
        return garray

    from janggu.data.genomicarray import LogTransform, ZScore
    ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
        'chr1': 150,
        'chr2': 300
    }),
                              stranded=False,
                              typecode='float32',
                              storage='ndarray',
                              cache=None,
                              loader=loading)
    ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
        'chr1': 150,
        'chr2': 300
    }),
                              stranded=False,
                              typecode='float32',
                              storage='ndarray',
                              cache=None,
                              loader=loading,
                              normalizer=[LogTransform()])
    ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
        'chr1': 150,
        'chr2': 300
    }),
                              stranded=False,
                              typecode='float32',
                              storage='ndarray',
                              cache=None,
                              loader=loading,
                              normalizer=[ZScore()])
    ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
        'chr1': 150,
        'chr2': 300
    }),
                              stranded=False,
                              typecode='float32',
                              storage='ndarray',
                              cache=None,
                              loader=loading,
                              normalizer=[LogTransform(),
                                          ZScore()])
    ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
        'chr1': 150,
        'chr2': 300
    }),
                              stranded=False,
                              typecode='float32',
                              storage='ndarray',
                              cache=None,
                              loader=loading,
                              normalizer=['zscorelog'])
    for store in ['ndarray', 'hdf5']:
        ga = create_genomic_array(GenomicIndexer.create_from_genomesize({
            'chr1': 150,
            'chr2': 300
        }),
                                  stranded=False,
                                  typecode='float32',
                                  storage=store,
                                  cache="cache_file",
                                  loader=loading,
                                  normalizer=['zscorelog'])
        np.testing.assert_allclose(ga.weighted_mean(),
                                   np.asarray([0.0]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga.weighted_sd(),
                                   np.asarray([1.]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga[Interval('chr1', 100, 101)],
                                   np.asarray([[[-1.412641340027806]]]),
                                   rtol=1e-5,
                                   atol=1e-5)
        np.testing.assert_allclose(ga[Interval('chr2', 100, 101)],
                                   np.asarray([[[0.706320670013903]]]),
                                   rtol=1e-5,
                                   atol=1e-5)
Example 20
    def flow(self):
        """Data flow generator."""

        refs = np.zeros(
            (self.batch_size, self.binsize - self.bioseq.garray.order + 1, 1,
             pow(self.bioseq._alphabetsize, self.bioseq.garray.order)))
        alts = np.zeros_like(refs)

        # get variants
        vcf = VariantFile(self.variants).fetch()

        def _get_replacement(new_nucleotide, previous_nucleotide, o):
            # helper function for replacing old with new nucleotides
            return (new_nucleotide - previous_nucleotide) * \
                   pow(self.bioseq._alphabetsize, o)

        # annotation is used to inform about the strandedness
        # to evaluate the variant
        if self.annotation is not None:
            varbed = BedTool(self.variants)
            n_vcf_fields = len(varbed[0].fields)
            vcf_strand_augment = iter(
                varbed.intersect(self.annotation, loj=True))

        try:
            while True:
                # construct genomic region
                names = []
                chroms = []
                poss = []
                rallele = []
                aallele = []

                ibatch = 0

                # prepare mini-batches of variants
                while ibatch < self.batch_size:
                    rec = next(vcf)
                    rec_strandedness = '+'
                    if self.annotation is not None:
                        rec_aug = next(vcf_strand_augment)
                        rec_strandedness = '-' if '-' in rec_aug[
                            n_vcf_fields:] else '+'

                    if not self.is_compatible(rec):
                        continue

                    start, end = self.get_interval(rec)

                    names.append(rec.id if rec.id is not None else '')
                    chroms.append(rec.chrom)
                    poss.append(rec.pos - 1)
                    rallele.append(rec.ref.upper())
                    aallele.append(rec.alts[0].upper())

                    # obtain the nucleotide indices around the variant
                    iref = self.bioseq._getsingleitem(
                        Interval(rec.chrom, start, end)).copy()
                    ialt = iref.copy()

                    for o in range(self.bioseq.garray.order):
                        # in the loop we adjust the original DNA sequence
                        # by using the alternative allele instead
                        #
                        # the loop is required for the higher-order nucleotide representation
                        # in which a single variant position affects multiple
                        # mutually overlapping positions in the one-hot encoding
                        #
                        # furthermore, the alternative allele is only set if
                        # the reference allele matches the reference genome,
                        # unless the ignore_reference_match option was used.

                        # this is the position at which to change the nucleotide
                        position_to_change = self.binsize//2 + o - \
                                          self.bioseq.garray.order + \
                                          (0 if self.binsize%2 == 0 else 1)

                        # determine the reference nucleotide
                        # this would be just irefbase itself for order=1
                        # but for higher-order representation it needs to
                        # be determined. e.g. for TT for order=2 would be irefbase==15
                        # which should give the nucleotides 3, 3
                        irefbase = iref[position_to_change]
                        irefbase = irefbase // pow(self.bioseq._alphabetsize,
                                                   o)
                        irefbase = irefbase % self.bioseq._alphabetsize

                        if self.ignore_reference_match:
                            # process the variant even if
                            # it does not match with the reference base

                            # replace nucleotides in the reference
                            # and in the alternative allele
                            iref[position_to_change] += _get_replacement(
                                NMAP[rec.ref.upper()], irefbase, o)

                            ialt[position_to_change] += _get_replacement(
                                NMAP[rec.alts[0].upper()], irefbase, o)
                            continue

                        if NMAP[rec.ref.upper()] != irefbase:
                            self.logger.info(
                                'VCF reference and reference genome not compatible. '
                                'Expected reference {}, but VCF indicates {}. '.
                                format(irefbase, NMAP[rec.ref.upper()]) +
                                'VCF-Record: {}:{}-{}>{};{}. Skipped.'.format(
                                    rec.chrom, rec.pos, rec.ref, rec.alts[0],
                                    rec.id))
                        else:
                            # at this point, it is ensured that the VCF reference
                            # agrees with the reference genome.
                            # keep the reference as it is, only change
                            # the alternative allele

                            ialt[position_to_change] += _get_replacement(
                                NMAP[rec.alts[0].upper()],
                                NMAP[rec.ref.upper()], o)

                    # if the strandedness is negative (from the annotation)
                    # the DNA sequences are reverse complemented
                    if rec_strandedness == '-':
                        ialt = self.bioseq._revcomp(ialt)
                        iref = self.bioseq._revcomp(iref)

                    alt = as_onehot(ialt[None, :], self.bioseq.garray.order,
                                    self.bioseq._alphabetsize)

                    alts[ibatch] = alt

                    ref = as_onehot(iref[None, :], self.bioseq.garray.order,
                                    self.bioseq._alphabetsize)
                    refs[ibatch] = ref

                    ibatch += 1
                yield names, chroms, poss, rallele, aallele, refs, alts

        except StopIteration:
            refs = refs[:ibatch]
            alts = alts[:ibatch]

            yield names, chroms, poss, rallele, aallele, refs, alts
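
To make the index arithmetic above concrete, here is a minimal, self-contained sketch of the base-4 positional encoding the comments describe. It assumes an alphabet of size 4 (A=0, C=1, G=2, T=3); decode_kmer and get_replacement are illustrative helpers mirroring the irefbase decoding and the _get_replacement offset used above, not the library's actual implementation.

ALPHABETSIZE = 4  # assumed encoding: A=0, C=1, G=2, T=3


def decode_kmer(index, order):
    """Recover the per-digit nucleotide codes from a k-mer index."""
    return [(index // ALPHABETSIZE**o) % ALPHABETSIZE for o in range(order)]


def get_replacement(new_base, old_base, o):
    """Offset that swaps old_base for new_base at digit position o."""
    return (new_base - old_base) * ALPHABETSIZE**o


# 'TT' at order=2 encodes to index 15, which decodes to the digits 3, 3,
# exactly as the comments above state.
assert decode_kmer(15, 2) == [3, 3]

# Replacing the T at digit 0 with an A (3 -> 0) shifts the index by -3:
assert 15 + get_replacement(0, 3, 0) == 12
assert decode_kmer(12, 2) == [0, 3]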
Example no. 21
0
def loading(garray):
    garray[Interval('chr1', 0, 150),
           0] = np.random.normal(loc=10, size=150).reshape(-1, 1)
    garray[Interval('chr2', 0, 300),
           0] = np.random.normal(loc=100, size=300).reshape(-1, 1)
    return garray
Example no. 22
0
parser.add_argument('--outdir',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the output directory for the plots")
parser.add_argument('--plotformat',
                    nargs='?',
                    default='png',
                    type=str,
                    help="Plot format")
args = parser.parse_args()

region = BedTool([
    Interval('chr1',
             args.start,
             args.end,
             strand='+',
             score='0',
             name='region')
])

exp2protein = {}
with open(args.table) as f:
    next(f)  # skip the header line
    for l in f:
        a = l.strip().split("\t")
        # map the experiment id (file name without its extension) to the protein
        exp2protein[".".join(a[1].split(".")[:-1])] = a[3]
#print(exp2protein)

peakfiles = [
    os.path.join(args.path, f) for f in listdir(args.path)
    if isfile(os.path.join(args.path, f)) and 'annotated' in f
]
Example no. 23
0
def test_check_resolution_collapse_compatibility(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def loading(garray):
        garray[Interval('chr1', 0, 150), 0] = np.repeat(10, 150).reshape(-1, 1)
        garray[Interval('chr2', 0, 300), 0] = np.repeat(1, 300).reshape(-1, 1)
        return garray

    with pytest.raises(Exception):
        # Error because resolution=50 but no collapser defined
        ga = create_genomic_array(GenomicIndexer.create_from_genomesize(
            {'chr1': 150, 'chr2': 300}),
                                  stranded=False,
                                  typecode='float32',
                                  storage="ndarray",
                                  cache=None,
                                  resolution=50,
                                  loader=loading,
                                  collapser=None,
                                  normalizer=['tpm'])

    with pytest.raises(Exception):
        # Error because resolution=None but no collapser defined
        ga = create_genomic_array(GenomicIndexer.create_from_genomesize(
            {'chr1': 150, 'chr2': 300}),
                                  stranded=False,
                                  typecode='float32',
                                  storage="ndarray",
                                  cache=None,
                                  resolution=None,
                                  loader=loading,
                                  collapser=None,
                                  normalizer=['tpm'])

    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [
            Interval('chr1', 0, 150),
            Interval('chr2', 0, 150),
            Interval('chr2', 150, 300)
        ],
        binsize=150,
        stepsize=None,
    ),
                              stranded=False,
                              typecode='float32',
                              storage="ndarray",
                              cache=None,
                              resolution=1,
                              loader=loading)
    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150),
         Interval('chr2', 0, 300)],
        binsize=None,
        stepsize=None,
        collapse=True),
                              stranded=False,
                              typecode='float32',
                              storage="ndarray",
                              cache='test',
                              resolution=None,
                              loader=loading,
                              store_whole_genome=None,
                              collapser='sum')
    ga = create_genomic_array(GenomicIndexer.create_from_file(
        [Interval('chr1', 0, 150),
         Interval('chr2', 0, 300)],
        binsize=None,
        stepsize=None,
        collapse=True),
                              stranded=False,
                              typecode='float32',
                              storage="ndarray",
                              cache=None,
                              resolution=None,
                              loader=loading,
                              collapser='sum',
                              normalizer=['tpm'])
Example no. 24
0
def profile_counts_fragments(file,
                             genomicregion,
                             selected_barcodes=None,
                             binsize=50):
    """ Generates pseudo-bulk tracks.

    Parameters
    ----------
    file : str
       Input fragments file (BED format); the name field holds the cell barcode.
    genomicregion : str
       Genomic coordinates. E.g. 'chr1:5000-10000'
    selected_barcodes : list(str) or None
       Contains a list of barcodes to consider for the profile.
       If None, all barcodes are considered. Default=None.
    binsize : int
       Resolution of the signal track in bp. Default: 50

    Returns
    -------
    anndata.AnnData
       AnnData object containing the read counts for the given locus.

    """

    bed = BedTool(file)

    def split_iv(gr):
        chr_, res = gr.split(':')
        start, end = res.split('-')
        return chr_, int(start), int(end)

    chrom, start, end = split_iv(genomicregion)
    intersect = bed.intersect(BedTool([Interval(chrom, start, end)]), wa=True)
    if len(intersect) == 0:
        raise ValueError(f'No data in {genomicregion}')

    positions = []
    cells = []
    barcodemap = OrderedDict()
    if selected_barcodes is not None:
        for i, sb in enumerate(selected_barcodes):
            barcodemap[sb] = i

    for region in intersect:
        bar = region.name
        if selected_barcodes is not None:
            if bar not in selected_barcodes:
                # skip barcode if not in selected_barcodes list
                continue
        if bar not in barcodemap:
            barcodemap[bar] = len(barcodemap)
        if region.start >= start:
            positions.append(region.start - start)
            cells.append(barcodemap[bar])
        if region.end < end:
            positions.append(region.end - start)
            cells.append(barcodemap[bar])

    smat = coo_matrix((np.ones(len(positions)), (positions, cells)),
                      shape=(end - start + 1, len(barcodemap)),
                      dtype='int32')

    # smoothing at binsize resolution: the banded matrix of ones acts as a
    # running-sum filter over binsize consecutive positions; keeping every
    # binsize-th row afterwards yields one aggregated count per bin
    data = np.ones((binsize, smat.shape[0]))
    offsets = np.arange(binsize)
    di = dia_matrix((data, offsets), shape=(smat.shape[0], smat.shape[0]))
    smat = di.dot(smat).tocsr()
    smat = smat[::binsize]

    var = pd.DataFrame({
        'chrom': [chrom] * int(np.ceil((end - start + 1) / binsize)),
        'start': np.arange(start, end + 1, binsize),
        'end': np.arange(start + binsize, end + binsize + 1, binsize)
    })

    obs = pd.DataFrame(index=[bc for bc in barcodemap])
    adata = AnnData(smat.T.tocsr(), obs=obs, var=var)
    adata.raw = adata
    return adata
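
A hypothetical usage sketch for the function above; the fragments file path, the barcodes, and the bin size are placeholders chosen for illustration, not values from the original source.

# Assumed inputs: 'fragments.bed' is a fragments file whose name field
# carries the cell barcode; the barcodes below are made up.
adata = profile_counts_fragments('fragments.bed',
                                 'chr1:5000-10000',
                                 selected_barcodes=['AAACGG', 'TTTCGC'],
                                 binsize=100)
print(adata.shape)       # (number of barcodes, number of bins)
print(adata.var.head())  # per-bin chrom/start/end coordinates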
Example no. 26
0
def test_fasta_extractor_over_chr_end():
    extractor = FastaExtractor('tests/data/fasta_test.fa')
    intervals = [Interval('chr1', 0, 100), Interval('chr1', 1, 101)]
    with pytest.raises(ValueError):
        data = extractor(intervals)
Example no. 27
0
                    help="Manimum allowed maximum AT content inside a peak")
args = parser.parse_args()

at_content_dict = coverage2dict(args.attrack)

# collect drop regions: each drop extends from a 'max' extremum
# to the following 'min' extremum
gcdrops = defaultdict(list)
with open(args.atdrops) as f:
    current_drop = []
    for l in f:
        a = l.strip().split("\t")
        if (not current_drop and a[3] == 'max'):
            current_drop.append(int(a[1]))
        if (current_drop and a[3] == 'min'):
            current_drop.append(int(a[1]))
            gcdrops[a[0]].append(tuple(current_drop))
            current_drop = []

for chrom, positions in gcdrops.items():
    for c, (start, end) in enumerate(positions, start=1):
        score = max(at_content_dict[chrom][start:end])
        if (score >= args.minat):
            sys.stdout.write(
                str(
                    Interval(chrom,
                             start,
                             end,
                             name="drop_%s_%d" % (chrom, c),
                             score="%1.3f" % score,
                             strand='+')))
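
The loop above assumes a four-column extrema file (chrom, position, value, 'max'/'min'). A small self-contained sketch with made-up rows shows how consecutive max/min pairs become drop regions:

# Hypothetical extrema rows (chrom, position, value, kind);
# the values are invented for illustration only.
rows = [
    ("chr1", 100, 0.55, "max"),
    ("chr1", 180, 0.20, "min"),
    ("chr1", 250, 0.60, "max"),
    ("chr1", 330, 0.25, "min"),
]

drops, current = [], []
for chrom, pos, _, kind in rows:
    if not current and kind == "max":
        current.append(pos)
    elif current and kind == "min":
        drops.append((chrom, current[0], pos))
        current = []

print(drops)  # [('chr1', 100, 180), ('chr1', 250, 330)]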
Example no. 28
0
def setUp(self):
    self.file = os.path.join(PATH, self.file)
    start, end, strand = 9719768, 9739768, "-"
    self.i = Interval("chr21", start, end, strand=strand)
    self.start, self.end, self.strand = start, end, strand
Example no. 29
0
def count_at_steps(sequence):
    """Count non-overlapping 'AT'/'TA' dinucleotide steps in a sequence."""
    count = 0
    prev = ''
    for s in sequence:
        if (s == 'A' and prev == 'T'):
            count += 1
            prev = ''

        elif (s == 'T' and prev == 'A'):
            count += 1
            prev = ''

        else:
            prev = s
    return count
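
A few assumed sanity checks for the counting logic above: the function tallies non-overlapping 'AT'/'TA' steps, resetting after each hit.

assert count_at_steps("ATAT") == 2  # two non-overlapping AT steps
assert count_at_steps("AATT") == 1  # only the central AT counts
assert count_at_steps("GCGC") == 0  # no A/T alternation at all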


atstretches = BedTool([
    Interval(x.chrom, x.start, x.end, x.name,
             str(count_at_steps(x.attrs['seq'])), x.strand)
    for x in atstretches if check_motif(x.attrs['seq'])
])
#for interval in atstretches:
#interval[5] = str(count_at_steps(interval.attrs['seq']));

# get a length-based division of the AT stretches
lengths = np.array([len(x) for x in atstretches])
l_division = [
    7, 9, 11, 14, 17, 20, 30, 40, 50
]  # [int(x) for x in sorted(set(np.percentile(lengths, np.linspace(0, 100, 18))))]
l_division[-1] += 1
#gccounts = [int(x.score) for x in atstretches]
#gc_division = [int(x) for x in sorted(set(gccounts))]
ac_division = [
    0, 1, 2, 3, 4, 5, 6, 7, 11,