Example #1
0
    def __init__(self, segments=None, uri=None):
        """Build a timeline from an optional collection of segments.

        Parameters
        ----------
        segments : Segment iterator, optional
            Initial segments. Falsy items are discarded before insertion.
        uri : string, optional
            Path to (or any identifier of) the segmented resource.
        """
        super(Timeline, self).__init__()

        # a missing or empty input yields no segments at all;
        # otherwise only truthy segments are kept
        if segments:
            kept = [segment for segment in segments if segment]
        else:
            kept = []

        # sorted set of segments (as an augmented red-black tree)
        self._segments = SortedSet(
            items=kept,
            key_type=(float, float),
            updator=TimelineUpdator)

        self.uri = uri
Example #2
0
def find_ranges(needle, genome, max_edit):
    """
    Finds the best matches in the FM-index genome for the needle within
    max_edit edit distance.

    Parameters
    ----------
    needle : sequence
        Pattern to look for; it is sliced into k-mers and passed to
        `align_glocal`.
    genome : object
        Index exposing `search(pattern)` (iterable of hit positions) and a
        sliceable `seq` attribute.
    max_edit : int
        Maximum edit distance tolerated.

    Returns
    -------
    alignments : list of (position, edit_distance)
        Alignments whose edit distance equals `best_edit`.
    best_edit : int
        Smallest edit distance found (`max_edit` if nothing better was
        found, 0 when `max_edit` is zero).
    """
    # Skip all the hard work if the edit distance is zero.
    if not max_edit:
        return [(h, 0) for h in genome.search(needle)], 0

    # Biggest substring that must exist in the haystack if the needle matches
    # within the edit distance (worst-case distribution of deletion edits).
    # Floor division keeps k an integer regardless of division semantics.
    # NOTE(review): when len(needle) < 2 * max_edit + 1, k is <= 0 and empty
    # k-mers are searched -- confirm callers never do this.
    k = (len(needle) - max_edit) // (max_edit + 1)

    # Lazily find where every k-mer of the needle matches exactly.
    kmerhits = (genome.search(needle[offset:offset + k])
                for offset in range(len(needle) - k + 1))

    # Sorted set of candidate intervals: [start, end, number of k-mer hits]
    ranges = SortedSet(updator=OverlappingIntervalsUpdator)

    # i is the offset of the k-mer inside the needle
    for i, hits in enumerate(kmerhits):
        for hit in hits:
            # Check for any existing candidate ranges containing this hit
            overlaps = ranges.overlap_point(hit)
            if not len(overlaps):
                # New range: worst case given the k-mer position in the needle
                ranges.add([hit - i - max_edit,
                            hit - i + len(needle) + max_edit,
                            1])
            else:
                # Increment the number of k-mers seen in each candidate range
                for overlap in overlaps:
                    overlap[2] += 1

    best_edit = max_edit

    # Candidates are the ranges supported by more than max_edit k-mer hits;
    # align each of them to check it really fits within the edit distance.
    alignments = []
    for candidate in (r for r in ranges if r[2] > max_edit):
        # A hit near the start of the genome gives a negative range start;
        # clamp it, otherwise the slice would wrap around to the end of seq.
        window_start = max(candidate[0], 0)
        ed, start = align_glocal(needle,
                                 genome.seq[window_start:candidate[1]])

        # Worse than the best so far (or beyond max_edit): not a match.
        if ed > best_edit:
            continue

        # Strictly better: earlier alignments are no longer the best ones.
        if ed < best_edit:
            best_edit = ed
            alignments = []
        alignments.append((window_start + start, ed))

    return alignments, best_edit
Example #3
0
class SingleVertexGraph(object):
    """Graph made of exactly one vertex.

    It exposes the same methods as the composite graph class so both can be
    handled uniformly; most of them are no-ops here.
    """

    def __init__(self, vertex_id):
        def compare(x, y):
            # order ids by subgraph size first, then by the id itself
            left = (self.subgraphs[x].count(), x)
            right = (self.subgraphs[y].count(), y)
            return (left > right) - (left < right)

        self.id = vertex_id
        self.is_single = True
        self.max_weight = 0.
        self.edges = {}
        # the only subgraph of a single vertex is the vertex itself
        self.subgraphs = {vertex_id: self}
        order = SortedSet(compare=compare)
        self.subgraphs_order = order
        order.add(vertex_id)

    def distance_matrix(self):
        # Replaces this method on the instance with a 1x1 zero matrix;
        # pickle_graph() reads the attribute afterwards.
        self.distance_matrix = np.zeros((1, 1))

    def loss(self):
        return None

    def loss_change(self, vertex_id, neighbour_id):
        return None

    def reduce(self, min_elems=100):
        return None

    def simplify_edges(self):
        return None

    def count(self):
        return 1

    def pickle_graph(self, path):
        # one file per graph, named after the graph id
        target = path + 'graph' + str(self.id) + '.p'
        payload = (self.distance_matrix, self.subgraphs.keys())
        with open(target, 'wb') as f:
            pickle.dump(payload, f)
Example #4
0
    def __init__(self, segments=None, uri=None):
        """Create a timeline, optionally pre-filled with segments.

        Parameters
        ----------
        segments : Segment iterator, optional
            Initial segments. Falsy items are dropped before insertion.
        uri : string, optional
            Path to (or any identifier of) the segmented resource.
        """
        super(Timeline, self).__init__()

        # no input (or an empty one) means no initial segments;
        # otherwise keep only the truthy ones
        if segments:
            initial = [segment for segment in segments if segment]
        else:
            initial = []

        # sorted set of segments (as an augmented red-black tree)
        self._segments = SortedSet(
            items=initial,
            key_type=(float, float),
            updator=TimelineUpdator)

        self.uri = uri
Example #5
0
    def establish_subgraphs_order(self):
        """Build `self.subgraphs_order`, a sorted set of the subgraph ids.

        Ids are ordered by the size (`count()`) of the subgraph they refer
        to, ties being broken by the id itself.
        """
        def compare(x, y):
            # self.subgraphs is looked up on every call on purpose, so the
            # comparison always reflects the current mapping
            key_x = (self.subgraphs[x].count(), x)
            key_y = (self.subgraphs[y].count(), y)
            if key_x < key_y:
                return -1
            if key_x > key_y:
                return 1
            return 0

        ordered_ids = SortedSet(self.subgraphs.keys(), compare=compare)
        self.subgraphs_order = ordered_ids
Example #6
0
 def __init__(self, vertex_id):
     """Set up a graph holding the single vertex `vertex_id`."""
     def compare(x, y):
         # order ids by subgraph size first, then by the id itself
         left = (self.subgraphs[x].count(), x)
         right = (self.subgraphs[y].count(), y)
         return (left > right) - (left < right)

     self.id = vertex_id
     self.is_single = True
     self.max_weight = 0.
     self.edges = {}
     # the only subgraph of a single vertex is the vertex itself
     self.subgraphs = {vertex_id: self}
     order = SortedSet(compare=compare)
     self.subgraphs_order = order
     order.add(vertex_id)
Example #7
0
class Timeline(object):
    """
    Ordered set of segments.

    A timeline can be seen as an ordered set of non-empty segments (Segment).
    Segments can overlap -- though adding an already existing segment to a
    timeline does nothing.

    Parameters
    ----------
    segments : Segment iterator, optional
        initial set of segments
    uri : string, optional
        name of segmented resource

    Returns
    -------
    timeline : Timeline
        New timeline

    Examples
    --------
    Create a new empty timeline

        >>> timeline = Timeline()
        >>> if not timeline:
        ...    print "Timeline is empty."
        Timeline is empty.

    Add one segment

        >>> segment = Segment(0, 1)
        >>> timeline.add(segment)
        >>> if len(timeline) == 1:
        ...    print "Timeline contains only one segment."
        Timeline contains only one segment.

    Add all segments from another timeline

        >>> other_timeline = Timeline([Segment(0.5, 3), Segment(6, 8)])
        >>> timeline.update(other_timeline)

    Get timeline extent, coverage & duration

        >>> extent = timeline.extent()
        >>> print extent
        [0 --> 8]
        >>> coverage = timeline.coverage()
        >>> print coverage
        [
           [0 --> 3]
           [6 --> 8]
        ]
        >>> duration = timeline.duration()
        >>> print "Timeline covers a total of %g seconds." % duration
        Timeline covers a total of 5 seconds.

    Iterate over (sorted) timeline segments

        >>> for segment in timeline:
        ...    print segment
        [0 --> 1]
        [0.5 --> 3]
        [6 --> 8]

    Segmentation

        >>> segmentation = timeline.segmentation()
        >>> print segmentation
        [
           [0 --> 0.5]
           [0.5 --> 1]
           [1 --> 3]
           [6 --> 8]
        ]

    Gaps

        >>> timeline = timeline.copy()
        >>> print timeline
        [
           [0 --> 1]
           [0.5 --> 3]
           [6 --> 8]
        ]
        >>> print timeline.gaps()
        [
           [3 --> 6]
        ]
        >>> segment = Segment(0, 10)
        >>> print timeline.gaps(segment)
        [
           [3 --> 6]
           [8 --> 10]
        ]

    """

    @classmethod
    def from_df(cls, df, uri=None):
        """Build a timeline from the `PYANNOTE_SEGMENT` column of `df`

        Parameters
        ----------
        df : mapping indexable by `PYANNOTE_SEGMENT`
            presumably a pandas DataFrame -- only `df[PYANNOTE_SEGMENT]`
            is read, and it must be iterable.
        uri : string, optional
            name of segmented resource

        Returns
        -------
        timeline : instance of `cls`
        """
        segments = list(df[PYANNOTE_SEGMENT])
        return cls(segments=segments, uri=uri)

    def __init__(self, segments=None, uri=None):

        super(Timeline, self).__init__()

        # sorted set of segments (as an augmented red-black tree)
        # falsy segments are dropped before insertion
        segments = [s for s in segments if s] if segments else []
        self._segments = SortedSet(items=segments,
                                   key_type=(float, float),
                                   updator=TimelineUpdator)

        # path to (or any identifier of) segmented resource
        self.uri = uri

    def __len__(self):
        """Number of segments"""
        return self._segments.length()

    def __nonzero__(self):
        """A timeline is truthy when it contains at least one segment"""
        return self._segments.length() > 0

    def __iter__(self):
        """Iterate over segments, in the order kept by the sorted set"""
        return iter(self._segments)

    def __getitem__(self, k):
        """Returns kth segment"""
        return self._segments.kth(k)

    def __eq__(self, other):
        """Two timelines are equal when their sets of segments are equal

        `uri` is not compared. Comparison with anything that is not a
        Timeline is left to Python (NotImplemented) instead of failing on
        the missing `_segments` attribute.
        """
        if not isinstance(other, Timeline):
            return NotImplemented
        return self._segments == other._segments

    def __ne__(self, other):
        """Negation of equality (see `__eq__`)"""
        if not isinstance(other, Timeline):
            return NotImplemented
        return self._segments != other._segments

    def index(self, segment):
        """Index of segment

        Parameter
        ---------
        segment : Segment

        Raises
        ------
        ValueError if the segment is not present
        """
        return self._segments.index(segment)

    def add(self, segment):
        """Add segment (falsy segments are ignored)"""
        if segment:
            self._segments.add(segment)

    def update(self, timeline):
        """Add `timeline` segments"""
        self._segments.update(timeline._segments)

    def union(self, other):
        """Create new timeline made of union of segments

        The new timeline keeps the `uri` of this timeline.
        """
        segments = self._segments.union(other._segments)
        return Timeline(segments=segments, uri=self.uri)

    def co_iter(self, other):
        """Iterate over pairs of segments (one from each timeline)

        Pairs are the ones produced by the underlying sorted set `co_iter`
        (presumably pairs of intersecting segments -- confirm against the
        sorted set implementation).

        Parameters
        ----------
        other : Timeline

        Yields
        ------
        segment, other_segment : Segment, Segment
        """
        for segment, other_segment in self._segments.co_iter(other._segments):
            yield segment, other_segment

    def crop(self, other, mode='intersection', mapping=False):
        """Crop timeline to another segment or timeline

        Parameters
        ----------
        other : Segment or Timeline
            A segment is handled as a timeline made of this only segment.
        mode : {'intersection', 'loose', 'strict'}, optional
            'loose' keeps the segments of this timeline paired with a
            segment of `other` by `co_iter`. 'strict' keeps only the ones
            that are included in the paired segment of `other`.
            'intersection' (default) keeps the intersection of each pair.
        mapping : bool, optional
            Only used in 'intersection' mode. When True, also return a
            dictionary whose keys are the intersections and values the list
            of original segments they come from.

        Returns
        -------
        cropped : Timeline
        mapping : dict
            Only when `mapping` is True in 'intersection' mode.

        Raises
        ------
        NotImplementedError when `mode` is not supported
        TypeError when `other` is neither a Segment nor a Timeline
        """

        if isinstance(other, Segment):
            other = Timeline(segments=[other], uri=self.uri)
            return self.crop(other, mode=mode, mapping=mapping)

        if not isinstance(other, Timeline):
            # used to silently return None
            raise TypeError("unsupported type for crop: '%s'"
                            % type(other).__name__)

        if mode == 'loose':
            segments = [segment for segment, _ in self.co_iter(other)]
            return Timeline(segments=segments, uri=self.uri)

        if mode == 'strict':
            segments = [segment
                        for segment, other_segment in self.co_iter(other)
                        if segment in other_segment]
            return Timeline(segments=segments, uri=self.uri)

        if mode == 'intersection':
            if mapping:
                # intersection --> original segments it was obtained from
                inter_to_segments = {}
                for segment, other_segment in self.co_iter(other):
                    inter = segment & other_segment
                    inter_to_segments.setdefault(inter, []).append(segment)
                # iterating the dictionary yields its keys (intersections)
                cropped = Timeline(segments=inter_to_segments, uri=self.uri)
                return cropped, inter_to_segments

            segments = [segment & other_segment
                        for segment, other_segment in self.co_iter(other)]
            return Timeline(segments=segments, uri=self.uri)

        raise NotImplementedError("unsupported mode: '%s'" % mode)

    def overlapping(self, timestamp):
        """Get list of segments overlapping `timestamp`"""
        return self._segments.overlapping(timestamp)

    def __str__(self):
        """Human-friendly representation"""
        lines = ["   %s\n" % str(segment) for segment in self._segments]
        return "[\n" + "".join(lines) + "]"

    def __repr__(self):
        return "<Timeline(uri=%s, segments=%s)>" % (self.uri,
                                                    list(self._segments))

    def __contains__(self, included):
        """Inclusion

        Use expression 'segment in timeline' or 'other_timeline in timeline'

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` exists in timeline,
            False otherwise

        Raises
        ------
        TypeError when `included` is neither a Segment nor a Timeline

        """

        if isinstance(included, Segment):
            return included in self._segments

        if isinstance(included, Timeline):
            return self._segments.issuperset(included._segments)

        raise TypeError("unsupported type for inclusion test: '%s'"
                        % type(included).__name__)

    def empty(self):
        """Empty copy of a timeline.

        Examples
        --------

            >>> segments = [Segment(0, 1), Segment(2, 3)]
            >>> timeline = Timeline(segments, uri="MyVideo.avi")
            >>> empty = timeline.empty()
            >>> print empty.uri
            MyVideo.avi
            >>> print empty
            [
            ]

        """
        return Timeline(uri=self.uri)

    def copy(self, segment_func=None):
        """Duplicate timeline.

        If segment_func is provided, apply it to each segment first.

        Parameters
        ----------
        segment_func : function

        Returns
        -------
        timeline : Timeline
            A (possibly modified) copy of the timeline

        Examples
        --------

            >>> segments = [Segment(0, 1), Segment(2, 3)]
            >>> timeline = Timeline(segments, uri="MyVideo.avi")
            >>> cp = timeline.copy()
            >>> print cp.uri
            MyVideo.avi
            >>> print cp
            [
               [0 --> 1]
               [2 --> 3]
            ]

        """

        # if segment_func is not provided
        # just add every segment
        if segment_func is None:
            return Timeline(segments=self._segments, uri=self.uri)

        # if is provided
        # apply it to each segment before adding them
        return Timeline(segments=[segment_func(s) for s in self._segments],
                        uri=self.uri)

    def extent(self):
        """Timeline extent

        The extent of a timeline is the segment of minimum duration that
        contains every segments of the timeline. It is unique, by definition.
        The extent of an empty timeline is an empty segment.

        Returns
        -------
        extent : Segment
            Timeline extent

        Examples
        --------

            >>> segments = [Segment(0, 1), Segment(9, 10)]
            >>> timeline = Timeline(segments, uri="MyVideo.avi")
            >>> print timeline.extent()
            [0 --> 10]

        """
        return self._segments.extent()

    def coverage(self):
        """Timeline coverage

        The coverage of timeline is the timeline with the minimum number of
        segments with exactly the same time span as the original timeline.
        It is (by definition) unique and does not contain any overlapping
        segments.

        Returns
        -------
        coverage : Timeline
            Timeline coverage

        """

        # make sure URI attribute is kept.
        coverage = Timeline(uri=self.uri)

        # The coverage of an empty timeline is an empty timeline.
        if not self:
            return coverage

        # Principle:
        #   * gather all segments with no gap between them
        #   * add one segment per resulting group (their union |)
        # Note:
        #   Since segments are kept sorted internally,
        #   there is no need to perform an exhaustive segment clustering.
        #   We just have to consider them in their natural order.

        # Initialize new coverage segment
        # as very first segment of the timeline
        new_segment = self._segments.kth(0)

        for segment in self:

            # If there is no gap between new coverage segment and next segment,
            if not (segment ^ new_segment):
                # Extend new coverage segment using next segment
                new_segment |= segment

            # If there actually is a gap,
            else:
                # Add new segment to the timeline coverage
                coverage.add(new_segment)
                # Initialize new coverage segment as next segment
                # (right after the gap)
                new_segment = segment

        # Add new segment to the timeline coverage
        coverage.add(new_segment)

        return coverage

    def duration(self):
        """Timeline duration

        Returns
        -------
        duration : float
            Duration of timeline coverage, in seconds.

        """

        # The timeline duration is the sum of the durations
        # of the segments in the timeline coverage.
        return sum(s.duration for s in self.coverage())

    def gaps(self, focus=None):
        """Timeline gaps

        Parameters
        ----------
        focus : None, Segment or Timeline
            Defaults to the timeline extent.

        Returns
        -------
        gaps : Timeline
            Timeline made of all gaps from original timeline, and delimited
            by provided segment or timeline.

        Raises
        ------
        TypeError when `focus` is neither None, Segment nor Timeline

        Examples
        --------

            >>> segments = [Segment(0, 1), Segment(0.5, 3), Segment(6, 8)]
            >>> timeline = Timeline(segments)
            >>> print timeline.gaps()
            [
               [3 --> 6]
            ]
            >>> print timeline.gaps(Segment(0, 10))
            [
               [3 --> 6]
               [8 --> 10]
            ]

        """
        if focus is None:
            focus = self.extent()

        if not isinstance(focus, (Segment, Timeline)):
            raise TypeError("unsupported operand type(s) for -':"
                            "%s and Timeline." % type(focus).__name__)

        # segment focus
        if isinstance(focus, Segment):

            # starts with an empty timeline
            timeline = self.empty()

            # `end` is meant to store the end time of former segment
            # initialize it with beginning of provided segment `focus`
            end = focus.start

            # focus on the intersection of timeline and provided segment
            for segment in self.crop(focus, mode='intersection').coverage():

                # add gap between each pair of consecutive segments
                # if there is no gap, segment is empty, therefore not added
                timeline.add(Segment(start=end, end=segment.start))

                # keep track of the end of former segment
                end = segment.end

            # add final gap (if not empty)
            timeline.add(Segment(start=end, end=focus.end))

        # other_timeline - timeline
        elif isinstance(focus, Timeline):

            # starts with an empty timeline
            timeline = self.empty()

            # add gaps for every segment in coverage of provided timeline
            for segment in focus.coverage():
                timeline.update(self.gaps(focus=segment))

        return timeline

    def segmentation(self):
        """Non-overlapping timeline

        Create the unique timeline with same coverage and same set of segment
        boundaries as original timeline, but with no overlapping segments.

        A picture is worth a thousand words:

            Original timeline:
            |------|    |------|     |----|
              |--|    |-----|     |----------|

            Non-overlapping timeline
            |-|--|-|  |-|---|--|  |--|----|--|

        Returns
        -------
        timeline : Timeline

        Examples
        --------

            >>> timeline = Timeline([Segment(0, 1), Segment(1, 2),
            ...                      Segment(2, 3), Segment(2, 4),
            ...                      Segment(6, 7)])
            >>> print timeline.segmentation()
            [
               [0 --> 1]
               [1 --> 2]
               [2 --> 3]
               [3 --> 4]
               [6 --> 7]
            ]

        """
        # COMPLEXITY: O(n)
        coverage = self.coverage()

        # COMPLEXITY: O(n.log n)
        # get all boundaries (sorted)
        # |------|    |------|     |----|
        #   |--|    |-----|     |----------|
        # becomes
        # | |  | |  | |   |  |  |  |    |  |
        timestamps = set()
        for (start, end) in self:
            timestamps.add(start)
            timestamps.add(end)
        timestamps = sorted(timestamps)

        # create new partition timeline
        # | |  | |  | |   |  |  |  |    |  |
        # becomes
        # |-|--|-|  |-|---|--|  |--|----|--|

        # start with an empty copy
        timeline = Timeline(uri=self.uri)

        if len(timestamps) > 0:
            segments = []
            start = timestamps[0]
            for end in timestamps[1:]:

                # only add segments that are covered by original timeline
                # (a span between two boundaries is covered when its middle
                # overlaps a segment of the coverage)
                segment = Segment(start=start, end=end)
                if segment and coverage.overlapping(segment.middle):
                    segments.append(segment)

                # next segment...
                start = end

            timeline._segments.update(segments)

        return timeline

    def for_json(self):
        """Serialize timeline as a JSON-ready dictionary

        The dictionary stores the class name under `PYANNOTE_JSON`, the list
        of serialized segments under `PYANNOTE_JSON_CONTENT` and, only when
        it is truthy, the uri under `PYANNOTE_URI`.
        """
        data = {PYANNOTE_JSON: self.__class__.__name__}
        data[PYANNOTE_JSON_CONTENT] = [s.for_json() for s in self]

        if self.uri:
            data[PYANNOTE_URI] = self.uri

        return data

    @classmethod
    def from_json(cls, data):
        """Build timeline from a dictionary produced by `for_json`"""
        uri = data.get(PYANNOTE_URI, None)
        segments = [Segment.from_json(s) for s in data[PYANNOTE_JSON_CONTENT]]
        return cls(segments=segments, uri=uri)

    def _repr_png_(self):
        """PNG representation (delegated to pyannote.core.notebook)"""
        # imported here so that the notebook module is only needed
        # when a PNG representation is actually requested
        from pyannote.core.notebook import repr_timeline
        return repr_timeline(self)
Example #8
0
 def test_construct_bytes(self):
     # a set keyed on binary_type must reject text items...
     text_items = [u('0'), u('3'), u('1')]
     byte_items = [b'0', b'3', b'1']
     with self.assertRaises(TypeError):
         SortedSet(text_items, key_type=binary_type)
     # ...and accept byte strings
     tree = SortedSet(byte_items, key_type=binary_type)
     assert b'0' in tree
Example #9
0
 def test_construct_unicode(self):
     # a set keyed on text_type must reject byte strings...
     byte_items = [b'0', b'3', b'1']
     text_items = [u('0'), u('3'), u('1')]
     with self.assertRaises(TypeError):
         SortedSet(byte_items, key_type=text_type)
     # ...and accept text items
     tree = SortedSet(text_items, key_type=text_type)
     assert u('0') in tree
Example #10
0
class NewGraph(object):
    """Composite graph whose nodes are subgraphs.

    Subgraphs are stored by id in `self.subgraphs`; `self.edges[a][b]` holds
    the list of `(from_vertex, to_vertex, weight)` tuples going from subgraph
    `a` to subgraph `b` (until `simplify_edges` replaces each list with a
    single weight). `reduce` merges neighbouring subgraphs while doing so
    lowers the loss.
    """

    # id given to the next composite graph; decremented after each use, so
    # composite graphs get ids -1, -2, ...
    ID = -1

    def __init__(self, vertices_map=None, edges=None, subgraphs=None, subgraphs_order=None):
        """Build a graph from a vertices map, or from prebuilt parts.

        When `vertices_map` is None, `subgraphs`, `edges` and
        `subgraphs_order` are used as given; otherwise they are all derived
        from `vertices_map` (see `wrap_with_subgraphs`).
        """
        if vertices_map is None:
            self.subgraphs = subgraphs
            self.edges = edges
            self.subgraphs_order = subgraphs_order
        else:
            self.wrap_with_subgraphs(vertices_map)
            self.establish_subgraphs_order()
        self.is_single = False
        self.build_loss_parameters()

        self.id = NewGraph.ID
        NewGraph.ID -= 1
        # ids of the subgraphs that were merged away by merge_subgraphs
        self.merged = []
    
    def count(self):
        """Number of direct subgraphs."""
        return len(self.subgraphs)

    def establish_subgraphs_order(self):
        """Build `self.subgraphs_order`: subgraph ids sorted by subgraph
        size (`count()`), ties broken by id."""
        def compare(x, y):
            if self.subgraphs[x].count() == self.subgraphs[y].count():
                if x < y:
                    return -1
                elif x > y:
                    return 1
                else:
                    return 0
            elif self.subgraphs[x].count() < self.subgraphs[y].count():
                return -1
            else:
                return 1

        self.subgraphs_order = SortedSet(self.subgraphs.keys(), compare=compare)

    def wrap_with_subgraphs(self, vertices_map):
        """Create one SingleVertexGraph per vertex and the edges between them.

        `vertices_map[vertex_id]['similars']` maps neighbour id to weight.
        Note that the 'similars' entry is deleted from `vertices_map`, i.e.
        the caller's mapping is modified.
        """
        self.subgraphs = {}
        self.edges = {}
        for vertex_id in vertices_map:
            self.edges[vertex_id] = {}
            for key in vertices_map[vertex_id]['similars']:
                weight = vertices_map[vertex_id]['similars'][key]
                # each pair starts with a one-element edge list
                self.edges[vertex_id][key] = [(vertex_id, key, weight)]
            del(vertices_map[vertex_id]['similars'])
            new_subgraph = SingleVertexGraph(vertex_id)
            self.subgraphs[new_subgraph.id] = new_subgraph

    def build_loss_parameters(self):
        """Initialise the quantities used by `loss`."""
        self.within_cluster_distance = 0.
        self.singular_vertices_no = len(self.subgraphs)
        self.between_cluster_distance = 0.
        self.max_weight = 0.
        for from_v in self.edges:
            for to_w in self.edges[from_v]:
                edge_list = self.edges[from_v][to_w]
                edge_weights = map(lambda x: x[2], edge_list)
                self.max_weight = max(max(edge_weights), self.max_weight)
                self.between_cluster_distance +=sum(edge_weights)
        # presumably every edge is stored in both directions, hence the
        # halving -- verify against the producer of `edges`
        self.between_cluster_distance /= 2.

    def loss(self):
        """Current loss: (within-cluster distance + max_weight * number of
        singular vertices) * between-cluster distance."""
        wcd = self.within_cluster_distance
        svp = self.max_weight * self.singular_vertices_no
        bcd = self.between_cluster_distance
        # idea: loss * self.vertices[self.vertices_order[-1]].count()
        return (wcd + svp) * bcd

    def reduce(self, min_elems=100):
        """Merge neighbouring subgraphs while it lowers the loss.

        Each pass scans subgraphs in `subgraphs_order` and performs the first
        merge whose predicted loss is lower than the current one; it stops
        when no such merge exists or when at most `min_elems` subgraphs
        remain. Subgraphs are then reduced recursively and edges simplified.
        """
        # NOTE(review): merge_subgraphs does not update the attributes set
        # by build_loss_parameters, so self.loss() returns the same value on
        # every pass -- confirm whether that is intended.
        improvable = True
        while improvable and self.count() > min_elems:
            improvable = False
            improvement_point = None
            loss = self.loss()

            for subgraph_id in self.subgraphs_order:
                # NOTE(review): `subgraph` is not used below
                subgraph = self.subgraphs[subgraph_id]
                for neighbour_id in self.edges[subgraph_id]: 
                    new_loss = self.loss_after_merge(subgraph_id, neighbour_id)

                    if new_loss < loss:
                        new_subgraph_id = self.merge_subgraphs(subgraph_id, neighbour_id)
                        improvement_point = (subgraph_id, neighbour_id, new_subgraph_id)
                        # the merge modified the containers being iterated:
                        # leave both loops and start a new pass
                        break
                if improvement_point:
                    improvable = True
                    break
        print "Reducing subgraphs..."
        for subgraph_id in self.subgraphs:
            self.subgraphs[subgraph_id].reduce(min_elems=min_elems)
        print "Reduction complete!"
        self.simplify_edges()
        print "Simplified edges..."

    def loss_after_merge(self, subgraph_id, neighbour_id):
        """Loss the graph would have after merging the two subgraphs.

        Nothing is modified; the result is computed from deltas applied to
        the current loss parameters.
        """
        # weight of the last edge in the list between the two subgraphs
        # (presumably the heaviest one, since merge_sorted_lists keeps the
        # lists in ascending weight order -- TODO confirm)
        weight = self.edges[subgraph_id][neighbour_id][-1][2]
        max_e = lambda v: self.subgraphs[v].max_weight
        plus_candidate = max(max_e(subgraph_id), max_e(neighbour_id))
        plus_candidate = max(plus_candidate, weight)
        delta_wcd = plus_candidate - max_e(subgraph_id) - max_e(neighbour_id)

        # every single-vertex subgraph involved stops being singular
        delta_svp = 0
        if self.subgraphs[subgraph_id].is_single:
            delta_svp -= 1
        if self.subgraphs[neighbour_id].is_single:
            delta_svp -= 1
        
        # weight of the first edge of a list
        edge_weight = lambda edges: edges[0][2]
        min_weight_sum = lambda v: sum([edge_weight(self.edges[v][w]) for w in self.edges[v]])
        delta_bcd = weight - min_weight_sum(subgraph_id) - min_weight_sum(neighbour_id)
        new_edges = self.adjacent_edges(subgraph_id, neighbour_id)
        delta_bcd += sum(new_edges.values())

        new_wcd = self.within_cluster_distance + delta_wcd
        new_svp = (self.singular_vertices_no + delta_svp) * self.max_weight
        new_bcd = self.between_cluster_distance + delta_bcd
        return (new_wcd + new_svp) * new_bcd

    def adjacent_edges(self, subgraph_id, neighbour_id):
        """Map every neighbour of either subgraph to the smaller of the
        first-edge weights leading to it from the two subgraphs."""
        adjacent_edges = {k: self.edges[subgraph_id][k][0][2] for k in
                self.edges[subgraph_id]}
        neighbour_similars = self.edges[neighbour_id]
        for neighbour_adjacent in neighbour_similars:
            current_edge_weight = neighbour_similars[neighbour_adjacent][0][2]
            if neighbour_adjacent in adjacent_edges:
                existing_edge_weight = adjacent_edges[neighbour_adjacent]
                adjacent_edges[neighbour_adjacent] = min(existing_edge_weight, 
                                                         current_edge_weight)
            else:
                adjacent_edges[neighbour_adjacent] = current_edge_weight 
        return adjacent_edges

    def extract_edges(self, subgraph_id, neighbour_id):
        """Regroup the edges from `subgraph_id` to `neighbour_id` as a
        nested dict: from_vertex -> to_vertex -> list of edge tuples."""
        edges_between = self.edges[subgraph_id][neighbour_id]
        edges_between_dict = {}
        for (u, v, w) in edges_between:
            if u not in edges_between_dict:
                edges_between_dict[u] = {}
            if v not in edges_between_dict[u]:
                edges_between_dict[u][v] = []
            edges_between_dict[u][v].append( (u, v, w) )
        return edges_between_dict

    def simplify_edges(self):
        """Replace every edge list with the weight of its first edge."""
        for v1 in self.edges:
            for v2 in self.edges[v1]:
                self.edges[v1][v2] = self.edges[v1][v2][0][2]

    def merge_subgraphs(self, subgraph_id, neighbour_id):
        """Replace two neighbouring subgraphs with a new composite one.

        The new NewGraph contains the subgraphs and inner edges of both, plus
        the edges that used to connect them. Edges to every other neighbour
        are rewired to the new graph id.

        Returns
        -------
        The id of the newly created graph.
        """
        # ordering for the new graph's subgraphs; `new_subgraphs` is bound
        # further down, before the comparison is first used
        def compare(x, y):
            if new_subgraphs[x].count() == new_subgraphs[y].count():
                if x < y:
                    return -1
                elif x > y:
                    return 1
                else:
                    return 0
            elif new_subgraphs[x].count() < new_subgraphs[y].count():
                return -1
            else:
                return 1
        edges_from_left = self.subgraphs[subgraph_id].edges
        edges_from_right = self.subgraphs[neighbour_id].edges
        edges_left_to_right = self.extract_edges(subgraph_id, neighbour_id)
        edges_right_to_left = self.extract_edges(neighbour_id, subgraph_id)
 
        # inner edges of the new graph: first the edges that connected the
        # two subgraphs (both directions)...
        new_graph_edges = {}
        for v1 in edges_left_to_right:
            if v1 not in new_graph_edges:
                new_graph_edges[v1] = {}
            for v2 in edges_left_to_right[v1]:
                new_graph_edges[v1][v2] = edges_left_to_right[v1][v2]

        for v1 in edges_right_to_left:
            if v1 not in new_graph_edges:
                new_graph_edges[v1] = {}
            for v2 in edges_right_to_left[v1]:
                new_graph_edges[v1][v2] = edges_right_to_left[v1][v2]

        # ...then the edges each subgraph already had internally
        for v1 in edges_from_left:
            if v1 not in new_graph_edges:
                new_graph_edges[v1] = {}
            for v2 in edges_from_left[v1]:
                if v2 not in new_graph_edges[v1]:
                    new_graph_edges[v1][v2] = []
                new_graph_edges[v1][v2] += edges_from_left[v1][v2]

        for v1 in edges_from_right:
            if v1 not in new_graph_edges:
                new_graph_edges[v1] = {}
            for v2 in edges_from_right[v1]:
                if v2 not in new_graph_edges[v1]:
                    new_graph_edges[v1][v2] = []
                new_graph_edges[v1][v2] += edges_from_right[v1][v2]
        # the new graph owns the subgraphs of both merged subgraphs
        new_subgraphs = {}
        for sg in self.subgraphs[subgraph_id].subgraphs:
            new_subgraphs[sg] = self.subgraphs[subgraph_id].subgraphs[sg]
        for right_subgraph_id in self.subgraphs[neighbour_id].subgraphs:
            new_subgraphs[right_subgraph_id] = self.subgraphs[neighbour_id].subgraphs[right_subgraph_id]

        new_subgraphs_order = list(self.subgraphs[subgraph_id].subgraphs_order)
        new_subgraphs_order += list(self.subgraphs[neighbour_id].subgraphs_order)
        new_subgraphs_order = SortedSet(new_subgraphs_order, compare=compare)
        self.subgraphs_order.remove(subgraph_id)
        self.subgraphs_order.remove(neighbour_id)
        new_graph = NewGraph(subgraphs=new_subgraphs,
                          subgraphs_order=new_subgraphs_order, 
                          edges=new_graph_edges,
                          vertices_map=None)
       
        # the edges between the two subgraphs are now inner edges
        del(self.edges[subgraph_id][neighbour_id])
        del(self.edges[neighbour_id][subgraph_id])
        
        # rewire the remaining neighbours of both subgraphs to the new graph
        self.edges[new_graph.id] = {}
        all_neighbours = set(self.edges[subgraph_id].keys()) | set(self.edges[neighbour_id].keys())
        for neigh in all_neighbours:
            left_to_edges = []
            left_from_edges = []
            if neigh in self.edges[subgraph_id]:
                left_to_edges = self.edges[subgraph_id][neigh]
                left_from_edges = self.edges[neigh][subgraph_id]
            right_to_edges = []
            right_from_edges = []
            if neigh in self.edges[neighbour_id]:
                right_to_edges = self.edges[neighbour_id][neigh]
                right_from_edges = self.edges[neigh][neighbour_id]
            to_edges = self.merge_sorted_lists(left_to_edges, right_to_edges)
            from_edges = self.merge_sorted_lists(left_from_edges, right_from_edges)
            self.edges[neigh][new_graph.id] = from_edges
            self.edges[new_graph.id][neigh] = to_edges

        self.subgraphs[new_graph.id] = new_graph
        self.subgraphs_order.add(new_graph.id)
        # drop every reference to the two merged subgraphs
        for other_id in self.edges[subgraph_id]:
            del(self.edges[other_id][subgraph_id])
        for other_id in self.edges[neighbour_id]:
            del(self.edges[other_id][neighbour_id])
        del(self.edges[subgraph_id])
        del(self.edges[neighbour_id])
        del(self.subgraphs[subgraph_id])
        del(self.subgraphs[neighbour_id])
        self.merged += [neighbour_id, subgraph_id]
        return new_graph.id

    def merge_sorted_lists(self, l1, l2):
        """Merge two edge lists by ascending weight (item index 2).

        The result is sorted only if both inputs already are. On equal
        weights the item of `l2` comes first.
        """
        result = []
        i = 0
        j = 0
        while i < len(l1) and j < len(l2):
            if l1[i][2] < l2[j][2]:
                result.append(l1[i])
                i += 1
            else:
                result.append(l2[j])
                j += 1
        # at most one of the two tails is non-empty here
        result += l1[i:] + l2[j:]
        return result

    def distance_matrix(self):
        """Compute the matrix of shortest distances between subgraphs.

        Side effects: `self.edges` is emptied and, on this instance, the
        `distance_matrix` attribute is replaced by the resulting array (so
        this method can no longer be called on it). The same is then done
        for every subgraph.
        """
        subgraphs_cnt = len(self.subgraphs)
        dist_mat = np.ndarray(shape=(subgraphs_cnt, subgraphs_cnt), dtype=np.float_)
        # no known path: infinite distance; distance to itself: zero
        dist_mat.fill(float("inf"))
        np.fill_diagonal(dist_mat, 0.0)

        subgraphs_keys = self.subgraphs.keys()
        for index, key in enumerate(subgraphs_keys):
            for index2, key2 in enumerate(subgraphs_keys):
                if index == index2:
                    continue

                if key2 in self.edges[key]:
                    # the edge entry is stored as is, i.e. it is expected to
                    # be a single weight (presumably simplify_edges has been
                    # called before -- TODO confirm)
                    dist_mat[index, index2] = self.edges[key][key2]

        # relax every pair (v1, v2) through every intermediate subgraph w
        for index_w, w in enumerate(subgraphs_keys):
            for index_v1, v1 in enumerate(subgraphs_keys):
                for index_v2, v2 in enumerate(subgraphs_keys):
                    if v1 == v2 or v1 == w or v2 == w:
                        continue
                    v1w = dist_mat[index_v1, index_w]
                    wv2 = dist_mat[index_w, index_v2]
                    dist_mat[index_v1, index_v2] = min(dist_mat[index_v1, index_v2], v1w + wv2)

        self.edges = {}
        self.distance_matrix = dist_mat
        for sg in self.subgraphs:
            self.subgraphs[sg].distance_matrix()

    def pickle_graph(self, path):
        """Pickle `(distance_matrix, subgraph ids)` of this graph, then of
        every subgraph, to `path + 'graph' + <id> + '.p'`."""
        # NOTE(review): self.distance_matrix is the array only after
        # distance_matrix() has been called on this instance
        with open(path + 'graph' + str(self.id) + '.p', 'wb') as f:
            pickle.dump( (self.distance_matrix, self.subgraphs.keys()), f)
        for sg in self.subgraphs:
            self.subgraphs[sg].pickle_graph(path)
Example #11
0
class Timeline(object):
    """
    Ordered set of segments.

    A timeline can be seen as an ordered set of non-empty segments (Segment).
    Segments can overlap -- though adding an already existing segment to a
    timeline does nothing.

    Parameters
    ----------
    segments : Segment iterator, optional
        initial set of segments
    uri : string, optional
        name of segmented resource

    Returns
    -------
    timeline : Timeline
        New timeline

    Examples
    --------
    Create a new empty timeline

        >>> timeline = Timeline()
        >>> if not timeline:
        ...    print "Timeline is empty."
        Timeline is empty.

    Add one segment (+=)

        >>> segment = Segment(0, 1)
        >>> timeline.add(segment)
        >>> if len(timeline) == 1:
        ...    print "Timeline contains only one segment."
        Timeline contains only one segment.

    Add all segments from another timeline

        >>> other_timeline = Timeline([Segment(0.5, 3), Segment(6, 8)])
        >>> timeline.update(other_timeline)

    Get timeline extent, coverage & duration

        >>> extent = timeline.extent()
        >>> print extent
        [0 --> 8]
        >>> coverage = timeline.coverage()
        >>> print coverage
        [
           [0 --> 3]
           [6 --> 8]
        ]
        >>> duration = timeline.duration()
        >>> print "Timeline covers a total of %g seconds." % duration
        Timeline covers a total of 5 seconds.

    Iterate over (sorted) timeline segments

        >>> for segment in timeline:
        ...    print segment
        [0 --> 1]
        [0.5 --> 3]
        [6 --> 8]

    Segmentation

        >>> segmentation = timeline.segmentation()
        >>> print segmentation
        [
           [0 --> 0.5]
           [0.5 --> 1]
           [1 --> 3]
           [6 --> 8]
        ]

    Gaps

        >>> timeline = timeline.copy()
        >>> print timeline
        [
           [0 --> 1]
           [0.5 --> 3]
           [6 --> 8]
        ]
        >>> print timeline.gaps()
        [
           [3 --> 6]
        ]
        >>> segment = Segment(0, 10)
        >>> print timeline.gaps(segment)
        [
           [3 --> 6]
           [8 --> 10]
        ]

    """
    def __init__(self, segments=None, uri=None):
        """Build a timeline from an optional iterable of segments.

        Parameters
        ----------
        segments : Segment iterable, optional
            Initial segments; those evaluating to False are dropped.
        uri : string, optional
            Path to (or any identifier of) the segmented resource.
        """
        super(Timeline, self).__init__()

        # only keep segments that evaluate to True
        if segments:
            kept = [segment for segment in segments if segment]
        else:
            kept = []

        # sorted set of segments (as an augmented red-black tree)
        self._segments = SortedSet(
            items=kept,
            key_type=(float, float),
            updator=TimelineUpdator,
        )

        self.uri = uri

    def __len__(self):
        return self._segments.length()

    def __nonzero__(self):
        return self._segments.length() > 0

    def __iter__(self):
        return iter(self._segments)

    def __getitem__(self, k):
        """Returns kth segment"""
        return self._segments.kth(k)

    def __eq__(self, other):
        return self._segments == other._segments

    def __ne__(self, other):
        return self._segments != other._segments

    def index(self, segment):
        """Index of segment

        Parameter
        ---------
        segment : Segment

        Raises
        ------
        ValueError if the segment is not present
        """
        return self._segments.index(segment)

    def add(self, segment):
        """Add segment"""
        if segment:
            self._segments.add(segment)

    def update(self, timeline):
        """Add `timeline` segments"""
        self._segments.update(timeline._segments)

    def union(self, other):
        """Create new timeline made of union of segments

        The returned timeline carries the `uri` of this timeline (not the
        one of `other`).
        """
        merged = self._segments.union(other._segments)
        result = Timeline(segments=merged, uri=self.uri)
        return result

    def co_iter(self, other):
        for segment, other_segment in self._segments.co_iter(other._segments):
            yield segment, other_segment

    def crop(self, other, mode='intersection', mapping=False):
        """Crop timeline with a segment or another timeline

        Parameters
        ----------
        other : Segment or Timeline
            A `Segment` is first wrapped into a one-segment timeline.
        mode : {'intersection', 'loose', 'strict'}, optional
            How each ``(segment, other_segment)`` pair yielded by `co_iter`
            is turned into a segment of the result:
            'loose' keeps `segment` as is,
            'strict' keeps `segment` only if ``segment in other_segment``,
            'intersection' (default) keeps ``segment & other_segment``.
        mapping : bool, optional
            Only used in 'intersection' mode. When True, also return a
            dictionary whose keys are the intersections and whose values
            are the lists of original segments they come from.

        Returns
        -------
        cropped : Timeline
            New timeline (with the same `uri`).
        mapping : dict
            Only returned in 'intersection' mode with ``mapping=True``.

        Raises
        ------
        TypeError when `other` is neither a Segment nor a Timeline
        NotImplementedError when `mode` is not supported
        """
        if isinstance(other, Segment):
            other = Timeline(segments=[other], uri=self.uri)
            return self.crop(other, mode=mode, mapping=mapping)

        if not isinstance(other, Timeline):
            # consistent with `gaps` and `__contains__`, which also reject
            # unsupported types instead of silently returning None
            raise TypeError(
                "unsupported type for crop: expected Segment or Timeline, "
                "got %s." % type(other).__name__)

        if mode == 'loose':
            segments = [segment for segment, _ in self.co_iter(other)]
            return Timeline(segments=segments, uri=self.uri)

        if mode == 'strict':
            segments = [
                segment for segment, other_segment in self.co_iter(other)
                if segment in other_segment
            ]
            return Timeline(segments=segments, uri=self.uri)

        if mode == 'intersection':
            if not mapping:
                segments = [
                    segment & other_segment
                    for segment, other_segment in self.co_iter(other)
                ]
                return Timeline(segments=segments, uri=self.uri)

            # separate name: do not shadow the boolean `mapping` flag
            intersections = {}
            for segment, other_segment in self.co_iter(other):
                inter = segment & other_segment
                intersections.setdefault(inter, []).append(segment)
            # iterating the dictionary yields its keys (the intersections)
            return Timeline(segments=intersections, uri=self.uri), intersections

        raise NotImplementedError("unsupported mode: '%s'" % mode)

    def overlapping(self, timestamp):
        """Get list of segments overlapping `timestamp`"""
        return self._segments.overlapping(timestamp)

    def __str__(self):
        """Human-friendly representation"""

        string = "[\n"
        for segment in self._segments:
            string += "   %s\n" % str(segment)
        string += "]"
        return string

    def __repr__(self):
        return "<Timeline(uri=%s, segments=%s)>" % (self.uri,
                                                    list(self._segments))

    def __contains__(self, included):
        """Inclusion

        Supports both 'segment in timeline' and 'other_timeline in timeline'.

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            For a segment, True if it exists in the timeline.
            For a timeline, True if every one of its segments exists in
            this timeline. False otherwise.

        Raises
        ------
        TypeError when `included` is neither a Segment nor a Timeline
        """
        if isinstance(included, Segment):
            return included in self._segments

        if isinstance(included, Timeline):
            return self._segments.issuperset(included._segments)

        raise TypeError()

    def empty(self):
        """Empty copy of a timeline.

        The returned timeline has no segment but keeps the `uri`.

        Examples
        --------

            >>> timeline = Timeline(uri="MyVideo.avi")
            >>> timeline.add(Segment(0, 1))
            >>> timeline.add(Segment(2, 3))
            >>> empty = timeline.empty()
            >>> print empty.uri
            MyVideo.avi
            >>> print empty
            [
            ]

        """
        return Timeline(segments=None, uri=self.uri)

    def copy(self, segment_func=None):
        """Duplicate timeline.

        Parameters
        ----------
        segment_func : function, optional
            When provided, it is applied to each segment and the copy is
            built from the returned values.

        Returns
        -------
        timeline : Timeline
            A (possibly modified) copy of the timeline, with the same `uri`.

        Examples
        --------

            >>> timeline = Timeline(uri="MyVideo.avi")
            >>> timeline.add(Segment(0, 1))
            >>> timeline.add(Segment(2, 3))
            >>> cp = timeline.copy()
            >>> print cp.uri
            MyVideo.avi
            >>> print cp
            [
               [0 --> 1]
               [2 --> 3]
            ]

        """
        if segment_func is not None:
            # transform each segment before building the copy
            segments = [segment_func(segment) for segment in self._segments]
        else:
            # plain copy: reuse segments as they are
            segments = self._segments

        return Timeline(segments=segments, uri=self.uri)

    def extent(self):
        """Timeline extent

        The extent of a timeline is the segment of minimum duration that
        contains every segments of the timeline. It is unique, by definition.
        The extent of an empty timeline is an empty segment.

        Returns
        -------
        extent : Segment
            Timeline extent

        Examples
        --------

            >>> timeline = Timeline(uri="MyVideo.avi")
            >>> timeline += [Segment(0, 1), Segment(9, 10)]
            >>> print timeline.extent()
            [0 --> 10]

        """
        return self._segments.extent()

    def coverage(self):
        """Timeline coverage

        The coverage of a timeline is the timeline with the minimum number
        of segments spanning exactly the same time as the original one.
        It is (by definition) unique and does not contain any overlapping
        segments.

        Returns
        -------
        coverage : Timeline
            Timeline coverage (with the same `uri`).

        """

        # result keeps the URI attribute
        result = Timeline(uri=self.uri)

        # nothing to cover in an empty timeline
        if not self:
            return result

        # Segments are visited in their (sorted) iteration order, so one
        # pass is enough: consecutive segments with no gap in between are
        # merged into `current`, which is flushed whenever a gap shows up.
        current = self._segments.kth(0)

        for segment in self:

            if segment ^ current:
                # there is a gap: flush what has been merged so far
                # and start over from the segment right after the gap
                result.add(current)
                current = segment
            else:
                # no gap: grow the current coverage segment
                current |= segment

        # flush the last merged segment
        result.add(current)

        return result

    def duration(self):
        """Timeline duration

        Returns
        -------
        duration : float
            Sum of the durations of the segments of the timeline coverage,
            in seconds.

        """
        covered = self.coverage()
        return sum(segment.duration for segment in covered)

    def gaps(self, focus=None):
        """Timeline gaps

        Parameters
        ----------
        focus : None, Segment or Timeline
            Region in which gaps are looked for.
            Defaults to the timeline extent.

        Returns
        -------
        gaps : Timeline
            Timeline made of all gaps from original timeline, and delimited
            by provided segment or timeline.

        Raises
        ------
        TypeError when `focus` is neither None, Segment nor Timeline

        Examples
        --------

            >>> timeline = Timeline([Segment(0, 1), Segment(0.5, 3),
            ...                      Segment(6, 8)])
            >>> print timeline.gaps()
            [
               [3 --> 6]
            ]
            >>> print timeline.gaps(Segment(0, 10))
            [
               [3 --> 6]
               [8 --> 10]
            ]

        """
        if focus is None:
            focus = self.extent()

        if not isinstance(focus, (Segment, Timeline)):
            raise TypeError(
                "unsupported type for `focus`: expected None, Segment or "
                "Timeline, got %s." % type(focus).__name__)

        # starts with an empty timeline (same uri)
        timeline = self.empty()

        # segment focus
        if isinstance(focus, Segment):

            # `end` stores the end time of the former segment;
            # it is initialized with the beginning of `focus`
            end = focus.start

            # focus on the intersection of timeline and provided segment
            for segment in self.crop(focus, mode='intersection').coverage():

                # add gap between each pair of consecutive segments
                # (`add` ignores it when there is no gap, i.e. when the
                # resulting segment is empty)
                timeline.add(Segment(start=end, end=segment.start))

                # keep track of the end of former segment
                end = segment.end

            # add final gap (if not empty)
            timeline.add(Segment(start=end, end=focus.end))

        # timeline focus
        else:

            # add gaps for every segment in coverage of provided timeline
            for segment in focus.coverage():
                timeline.update(self.gaps(focus=segment))

        return timeline

    def segmentation(self):
        """Non-overlapping timeline

        Build the unique timeline that has the same coverage and the same
        set of segment boundaries as the original timeline, but whose
        segments do not overlap.

        A picture is worth a thousand words:

            Original timeline:
            |------|    |------|     |----|
              |--|    |-----|     |----------|

            Non-overlapping timeline
            |-|--|-|  |-|---|--|  |--|----|--|

        Returns
        -------
        timeline : Timeline

        Examples
        --------

            >>> timeline = Timeline([Segment(0, 1), Segment(1, 2),
            ...                      Segment(2, 3), Segment(2, 4),
            ...                      Segment(6, 7)])
            >>> print timeline.segmentation()
            [
               [0 --> 1]
               [1 --> 2]
               [2 --> 3]
               [3 --> 4]
               [6 --> 7]
            ]

        """
        covered = self.coverage()

        # Step 1: collect every segment boundary, sorted and deduplicated
        # |------|    |------|     |----|
        #   |--|    |-----|     |----------|
        # becomes
        # | |  | |  | |   |  |  |  |    |  |
        boundaries = set()
        for start, end in self:
            boundaries.update((start, end))
        boundaries = sorted(boundaries)

        # Step 2: turn consecutive boundaries into segments
        # | |  | |  | |   |  |  |  |    |  |
        # becomes
        # |-|--|-|  |-|---|--|  |--|----|--|
        result = Timeline(uri=self.uri)

        if boundaries:
            kept = []
            for start, end in zip(boundaries, boundaries[1:]):
                candidate = Segment(start=start, end=end)
                # skip candidates lying in a gap of the original timeline
                if candidate and covered.overlapping(candidate.middle):
                    kept.append(candidate)

            result._segments.update(kept)

        return result

    def for_json(self):
        """Serialization

        Returns
        -------
        data : dict
            Maps PYANNOTE_JSON_TIMELINE to the list of serialized segments
            and, when `uri` is set, PYANNOTE_URI to the uri.
        """
        segments = [segment.for_json() for segment in self]
        data = {PYANNOTE_JSON_TIMELINE: segments}
        if self.uri:
            data[PYANNOTE_URI] = self.uri
        return data

    @classmethod
    def from_json(cls, data):
        """Deserialization

        Parameters
        ----------
        data : dict
            Must provide the serialized segments under the
            PYANNOTE_JSON_TIMELINE key; the PYANNOTE_URI key is optional.

        Returns
        -------
        timeline : instance of `cls`
        """
        serialized = data[PYANNOTE_JSON_TIMELINE]
        segments = [Segment.from_json(item) for item in serialized]
        return cls(segments=segments, uri=data.get(PYANNOTE_URI))

    def _repr_png_(self):
        """PNG representation, delegated to `repr_timeline`

        `pyannote.core.notebook` is imported lazily, when the method is
        called.
        NOTE(review): presumably picked up by IPython rich display --
        confirm.
        """
        from pyannote.core.notebook import repr_timeline
        png = repr_timeline(self)
        return png