예제 #1
0
파일: bed2table.py 프로젝트: siping/cgat
    def count( self, bed ):
        '''Recompute the per-track overlap summary for *bed*.

        For each track in ``self.tracks`` the index entry for the
        contig of *bed* is queried with ``find(bed.start, bed.end)``
        and the first two fields of every hit are kept as an interval.
        A missing track or contig (``KeyError``) counts as no hits.

        ``self.data`` is replaced by a list with one tuple per track:
        the number of hits and the value returned by
        ``Intervals.calculateOverlap`` for the query interval against
        the combined hits (presumably the overlapping bases - confirm
        against ``Intervals``).
        '''

        summary = []
        for name in self.tracks:
            try:
                # the lookup and the iteration both stay inside the
                # try so that any KeyError means "no hits"
                hits = [ (hit[0], hit[1])
                         for hit in self.index[name][bed.contig].find( bed.start, bed.end ) ]
            except KeyError:
                hits = []

            n_hits = len( hits )
            covered = Intervals.calculateOverlap( [(bed.start, bed.end),],
                                                  Intervals.combine( hits ) )
            summary.append( (n_hits, covered) )

        self.data = summary
예제 #2
0
    def count( self, bed ):
        '''Refresh ``self.data`` with overlap statistics for *bed*.

        Every track listed in ``self.tracks`` contributes one tuple:
        how many indexed entries ``find(bed.start, bed.end)`` returned
        on the contig of *bed*, and the result of
        ``Intervals.calculateOverlap`` between the query interval and
        the combined entries.  Tracks or contigs absent from
        ``self.index`` (``KeyError``) are treated as having no entries.
        '''

        collected = []
        for track in self.tracks:
            found = []
            try:
                # build the list in one expression so a KeyError while
                # iterating leaves ``found`` empty
                found = [ (entry[0], entry[1])
                          for entry in self.index[track][bed.contig].find( bed.start, bed.end ) ]
            except KeyError:
                found = []

            collected.append(
                (len( found ),
                 Intervals.calculateOverlap( [(bed.start, bed.end),],
                                             Intervals.combine( found ) )) )

        self.data = collected
예제 #3
0
파일: bed2bed.py 프로젝트: Q-KIM/cgat
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *iterator* must be an iterator over bed entries sorted by contig
    and start position; an AssertionError is raised when an entry
    starts before its predecessor on the same contig.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    Groups (or, with *resolve_blocks*, merged entries) built from
    fewer than *min_intervals* input entries are skipped.

    If *resolve_blocks*, entries of a group are only merged with each
    other when ``Intervals.calculateOverlap`` of their
    ``toIntervals()`` is positive; several entries may then be
    yielded per group.

    If *stranded*, groups are built separately for each strand.

    The score gives the number of intervals that have been merged.

    Raises ValueError if both *remove_inconsistent* and *by_name* are
    set. As this is a generator, the error surfaces when iteration
    starts.
    """

    if remove_inconsistent and by_name:
        # must be ``raise``: asserting an exception instance is always
        # true and would silently accept the invalid combination
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        """yield lists of entries that belong to the same group."""
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        # An empty input simply produces no chunks. The StopIteration
        # must not escape the generator (RuntimeError under PEP 479).
        try:
            last = next(iterator)
        except StopIteration:
            return

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]
        # remember the first name, otherwise by_name always separates
        # the first entry from the second one
        last_name[strand] = last.name

        for bed in iterator:

            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            # gap between this entry and the right-most end seen so far
            # on the same strand
            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:

                # new contig: flush the pending groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] != bed.name)):

                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        # flush whatever is still pending; the generator then ends by
        # falling off the end instead of raising StopIteration
        for strand in to_join:
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set(x.name for x in to_join)
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:

            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat until a full pass over the group merges nothing
            merged = True
            while merged:

                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1, intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            # Re-read the intervals so that a later
                            # merge in this pass builds on the merged
                            # entry. Presumably fromIntervals replaces
                            # the blocks of bed1, so a stale copy would
                            # drop the blocks merged in so far.
                            intervals1 = bed1.toIntervals()
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only entries that were created from at least
            # min_intervals input intervals

            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # the first entry of the group is extended in place and
            # re-used as the merged record
            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
예제 #4
0
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *iterator* must be an iterator over bed entries sorted by contig
    and start position; an AssertionError is raised when an entry
    starts before its predecessor on the same contig.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    Groups (or, with *resolve_blocks*, merged entries) built from
    fewer than *min_intervals* input entries are skipped.

    If *resolve_blocks*, entries of a group are only merged with each
    other when ``Intervals.calculateOverlap`` of their
    ``toIntervals()`` is positive; several entries may then be
    yielded per group.

    If *stranded*, groups are built separately for each strand.

    The score gives the number of intervals that have been merged.

    Raises ValueError if both *remove_inconsistent* and *by_name* are
    set. As this is a generator, the error surfaces when iteration
    starts.
    """

    if remove_inconsistent and by_name:
        # must be ``raise``: asserting an exception instance is always
        # true and would silently accept the invalid combination
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        """yield lists of entries that belong to the same group."""
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        # An empty input simply produces no chunks. A StopIteration
        # escaping from a generator body is turned into a RuntimeError
        # (PEP 479), so it is handled here.
        try:
            last = next(iterator)
        except StopIteration:
            return

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]
        # remember the first name, otherwise the empty default would
        # let a differently named second entry join the first one
        last_name[strand] = last.name

        for bed in iterator:

            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            # gap between this entry and the right-most end seen so far
            # on the same strand
            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:

                # new contig: flush the pending groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or (by_name and last_name[strand]
                                       and last_name[strand] != bed.name)):

                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = list()

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        # flush whatever is still pending; the generator then ends by
        # falling off the end instead of raising StopIteration
        for strand in sorted(to_join):
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set(x.name for x in to_join)
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat until a full pass over the group merges nothing
            merged = True
            while merged:
                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            # Re-read the intervals so that a later
                            # merge in this pass builds on the merged
                            # entry. Presumably fromIntervals replaces
                            # the blocks of bed1, so a stale copy would
                            # drop the blocks merged in so far.
                            intervals1 = bed1.toIntervals()
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only entries that were created from at least
            # min_intervals input intervals

            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # the first entry of the group is extended in place and
            # re-used as the merged record
            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))