Example No. 1
    def __init__(self, num_elves):
        self.elves = []
        for i in xrange(1, num_elves + 1):
            elf = Elf(i)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

        self.pending_toys = SortedCollection(key=itemgetter(1))
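
All of the examples on this page lean on the SortedCollection recipe: a pair of parallel lists (keys and items) kept in key order and queried with the bisect module. Below is a minimal sketch of the core interface these snippets assume; names and behaviour follow Raymond Hettinger's well-known ActiveState recipe, but the copies vendored by these projects add extras (find_range, pop_le, insert_right, remove, index) that appear in later examples.

from bisect import bisect_left, bisect_right

class SortedCollection(object):
    'Sketch of the recipe: parallel _keys/_items lists kept sorted, queried via bisect.'

    def __init__(self, iterable=(), key=None):
        self._key = key if key is not None else (lambda x: x)
        decorated = sorted((self._key(item), item) for item in iterable)
        self._keys = [k for k, item in decorated]
        self._items = [item for k, item in decorated]

    def __len__(self):
        return len(self._items)

    def __getitem__(self, i):
        return self._items[i]

    def __iter__(self):
        return iter(self._items)

    def insert(self, item):
        'Insert item, keeping the collection sorted on key(item).'
        k = self._key(item)
        i = bisect_left(self._keys, k)
        self._keys.insert(i, k)
        self._items.insert(i, item)

    def find_le(self, k):
        'Rightmost item with a key <= k, else ValueError.'
        i = bisect_right(self._keys, k)
        if i:
            return self._items[i - 1]
        raise ValueError('no item with key <= %r' % (k,))

    def find_ge(self, k):
        'Leftmost item with a key >= k, else ValueError.'
        i = bisect_left(self._keys, k)
        if i != len(self._items):
            return self._items[i]
        raise ValueError('no item with key >= %r' % (k,))
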
Example No. 2
    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels, key=lambda label: label.offsets[OffsetType.BYTES].first)
Example No. 3
def on_orderbook_snapshot(data):
    order_books[symbol]['bids'] = SortedCollection([[float(pq) for pq in bid]
                                                    for bid in data['bids']],
                                                   key=itemgetter(0))

    order_books[symbol]['asks'] = SortedCollection([[float(pq) for pq in ask]
                                                    for ask in data['asks']],
                                                   key=itemgetter(0))
Example No. 4
    def __init__(self, markets: list, ticker_observer: TickerObserver):
        super().__init__(markets, ticker_observer)
        self.mapping = {}  # maps pairs to bitfinex channel ids

        for m in markets:
            self.order_books[m] = {
                'bids': SortedCollection(key=itemgetter(0)),
                'asks': SortedCollection(key=itemgetter(0))
            }
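
Keying both sides of the book on price (itemgetter(0)) keeps each side in ascending price order, so the best quotes can be read off the ends without re-sorting on every update. A purely hypothetical illustration follows; the book, prices, and quantities are made up for the sketch and are not taken from the project above.

from operator import itemgetter

book = {'bids': SortedCollection(key=itemgetter(0)),
        'asks': SortedCollection(key=itemgetter(0))}

book['bids'].insert([100.0, 2.5])    # [price, quantity]
book['bids'].insert([101.5, 1.0])
book['asks'].insert([102.0, 3.0])

best_bid = book['bids'][-1]          # highest bid price sorts last
best_ask = book['asks'][0]           # lowest ask price sorts first
spread = best_ask[0] - best_bid[0]   # 0.5
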
Example No. 5
    def __init__(self, timeline=None):
        """ Initialize.

        Kwargs:
            timeline (list of Actions): a timeline of all Replay Actions
        """
        if timeline is not None:
            self.timeline = timeline
        else:
            self.timeline = SortedCollection(key=attrgetter('timestamp'))

        self.repSets = []  #TODO: Define repset class
Example No. 6
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.

    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]

    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.

    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first
        )

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop( offset_type )

            assert label_off.length == len(label_off.value)

            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #                         '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))

            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)

            toks = token_collection.find_range(
                    label_off.first, label_off.first + label_off.length)

            #print "find_le: ", token_collection.find_le(label_off.first)

            toks = list(toks)
            #print 'aligned tokens', toks

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                assert tok.token is not None, tok.token

                if not tok.token in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[offset_type].first, t.token)
                          for t in toks],
                         label_off.value))
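
find_range is not part of the base ActiveState recipe; the copy used here extends it so that every token whose starting offset falls inside the label's span can be pulled out in one call. Assuming the recipe's internal parallel _keys/_items lists, and assuming half-open semantics (start <= key < end), a plausible implementation would look roughly like the sketch below; it is an assumption, not the project's actual code.

from bisect import bisect_left

def find_range(self, start, end):
    'Items whose key k satisfies start <= k < end (assumed semantics).'
    # written as it would sit inside the SortedCollection class
    i = bisect_left(self._keys, start)
    out = []
    while i < len(self._items) and self._keys[i] < end:
        out.append(self._items[i])
        i += 1
    return out
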
Example No. 7
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.

    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]

    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.

    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first)

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(offset_type)

            assert label_off.length == len(label_off.value)

            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #                         '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))

            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)

            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)

            #print "find_le: ", token_collection.find_le(label_off.first)

            toks = list(toks)
            #print 'aligned tokens', toks

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                assert tok.token is not None, tok.token

                if not tok.token in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[offset_type].first, t.token)
                          for t in toks],
                         label_off.value))
Example No. 8
    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
        flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None
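
Because arrival_data is keyed on the sequence number of each (seq_num, pkt) pair, asking which packet first carried a given byte later reduces to a single find_le lookup, which is exactly what the seq_arrival method does in the fuller versions of this class below. A simplified stand-alone illustration with placeholder packet objects:

from operator import itemgetter

arrivals = SortedCollection(key=itemgetter(0))
arrivals.insert((1000, 'pkt A'))    # bytes from seq 1000 arrived in packet A
arrivals.insert((1500, 'pkt B'))    # bytes from seq 1500 arrived in packet B

assert arrivals.find_le(1234)[1] == 'pkt A'   # seq 1234 first arrived in A
assert arrivals.find_le(1700)[1] == 'pkt B'
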
Example No. 9
    def finish(self):
        '''
        Notifies the direction that there are no more packets coming. This means
        that self.data can be decided upon, and arrival_data can be converted to
        a SortedCollection for querying
        '''
        # set data to the data from the first chunk, if there is one
        if self.chunks:
            self.data = self.chunks[0].data
            self.seq_start = self.chunks[0].seq_start
        else:
            self.data = ''
        self.arrival_data = SortedCollection(self.arrival_data,
                                             key=lambda v: v[0])
Example No. 10
    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
        flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None
Example No. 11
    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)
Example No. 12
    def calculate_final_arrivals(self):
        '''
        make self.final_arrival_data a SortedCollection. Final arrival
        for a sequence number is when that sequence number of data and all the
        data before it have arrived, that is, when the data is usable by the
        application. Must be called after self.finish().
        '''
        self.final_arrival_data = []
        peak_time = 0.0
        # final arrival vertex always coincides with an arrival vertex
        for vertex in self.arrival_data:
            if vertex[1].ts > peak_time:
                peak_time = vertex[1].ts
                self.final_arrival_data.append((vertex[0], vertex[1].ts))
        self.final_arrival_data = SortedCollection(self.final_arrival_data,
                                                   key=lambda v: v[0])
Example No. 13
    def __init__(self, num_elves):
        self.elves = []
        for i in xrange(1, num_elves+1):
            elf = Elf(i)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

        self.pending_toys = SortedCollection(key=itemgetter(1))
Example No. 14
    def __init__(self, timeline=None):
        """ Initialize.

        Kwargs:
            timeline (list of Actions): a timeline of all Replay Actions
        """
        if timeline is not None:
            self.timeline = timeline
        else:
            self.timeline = SortedCollection(key=attrgetter('timestamp'))

        self.repSets = [] #TODO: Define repset class
Example No. 15
class Replay(object):
    """ A Replay object """

    def __init__(self, timeline=None):
        """ Initialize.

        Kwargs:
            timeline (list of Actions): a timeline of all Replay Actions
        """
        if timeline is not None:
            self.timeline = timeline
        else:
            self.timeline = SortedCollection(key=attrgetter('timestamp'))

        self.repSets = [] #TODO: Define repset class

    def insertAction(self, action):
        if isinstance(action, Action):
            self.timeline.insert(action)

    def insertActions(self, actionList):
        for action in actionList:
            if isinstance(action, Action):
                self.insertAction(action)

    def playback(self):
        for action in self.timeline:
            print str(action)

    def __eq__(self, other):
        if isinstance(other, Replay):
            # TODO: Is this good enough?
            return self.timeline == other.timeline
        return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
Example No. 16
class Replay(object):
    """ A Replay object """
    def __init__(self, timeline=None):
        """ Initialize.

        Kwargs:
            timeline (list of Actions): a timeline of all Replay Actions
        """
        if timeline is not None:
            self.timeline = timeline
        else:
            self.timeline = SortedCollection(key=attrgetter('timestamp'))

        self.repSets = []  #TODO: Define repset class

    def insertAction(self, action):
        if isinstance(action, Action):
            self.timeline.insert(action)

    def insertActions(self, actionList):
        for action in actionList:
            if isinstance(action, Action):
                self.insertAction(action)

    def playback(self):
        for action in self.timeline:
            print str(action)

    def __eq__(self, other):
        if isinstance(other, Replay):
            # TODO: Is this good enough?
            return self.timeline == other.timeline
        return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
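
Because the timeline is keyed on attrgetter('timestamp'), actions inserted in any order come back out in time order during playback. A small illustration of that invariant, using a made-up namedtuple in place of the project's Action class and inserting into the timeline directly:

from collections import namedtuple

Event = namedtuple('Event', ['timestamp', 'name'])   # stand-in for Action

replay = Replay()
for ts, name in [(5.0, 'jump'), (1.0, 'spawn'), (3.0, 'move')]:
    replay.timeline.insert(Event(ts, name))

assert [e.name for e in replay.timeline] == ['spawn', 'move', 'jump']
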
Example No. 17
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)

            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #         label_off.first+label_off.length]))

            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first
                )

            toks = token_collection.find_range(
                    label_off.first, label_off.first + label_off.length)

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[OffsetType.LINES].first, t.token)
                          for t in toks],
                         label_off.value))
Example No. 18
    def finish(self):
        '''
        Notifies the direction that there are no more packets coming. This means
        that self.data can be decided upon, and arrival_data can be converted to
        a SortedCollection for querying
        '''
        # set data to the data from the first chunk, if there is one
        if self.chunks:
            self.data = self.chunks[0].data
            self.seq_start = self.chunks[0].seq_start
        else:
            self.data = ''
        self.arrival_data = SortedCollection(self.arrival_data, key=lambda v: v[0])
Example No. 19
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)

            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #         label_off.first+label_off.length]))

            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first)

            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[OffsetType.LINES].first, t.token)
                          for t in toks],
                         label_off.value))
Example No. 20
    def get_closest_correct(self, word):
        word = tuple(c for c in word)

        # caching variables for speedup
        self.seen = {word: (0., 0)}
        self.change_cache = {}
        self.done = set()
        self.not_done = SortedCollection(key=lambda x: x[1][0])
        self.not_done.insert(self.seen.items()[0])

        while True:
            new_value, new_words = self.__get_closest_for_seen()
            if len(new_words) == 0:
                return None

            correct_words = new_words & self.corrects
            if len(correct_words) > 0:
                return correct_words

            for w in new_words:
                if w not in self.seen:
                    self.seen[w] = new_value
                    self.not_done.insert((w, new_value))
Example No. 21
class Solution:

    def __init__(self, num_elves):
        self.elves = []
        for i in xrange(1, num_elves+1):
            elf = Elf(i)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

        self.pending_toys = SortedCollection(key=itemgetter(1))

    def solve(self, toys):
        hrs = Hours()
        next_toy = None
        current_time = 540  # Santa's Workshop opens Jan 1, 2014 9:00 (= 540 minutes)

        while True:
            next_elf_time, elf = heapq.heappop(self.elves)
            current_time = max(current_time, next_elf_time)

            if (next_toy != None and next_toy.arrival_minute <= current_time):
                self.pending_toys.insert((next_toy, next_toy.duration))
                next_toy = None

            if (next_toy == None):
                for toy in toys:
                    if (toy.arrival_minute <= current_time):
                        self.pending_toys.insert((toy, toy.duration))
                    else:
                        next_toy = toy
                        break

            if (len(self.pending_toys) == 0 and next_toy == None):
                raise StopIteration()

            if (len(self.pending_toys) == 0):
                current_time = next_toy.arrival_minute
                continue

            remaining_time = hrs.get_remaining_sanctioned_time(current_time)
            
            if (remaining_time == hrs.sanctioned_minutes_per_day and elf.rating >= 4):
                toy, duration = self.pending_toys.pop_le(sys.maxint)
            else:
                try:
                    toy, duration = self.pending_toys.pop_le(remaining_time)
                except ValueError:
                    toy, duration = self.pending_toys.pop_le(sys.maxint)
            
            work_duration = elf.asign_toy(current_time, toy, hrs)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

            yield toy.id, elf.id, current_time, work_duration, elf.rating
Example No. 22
class Solution:
    def __init__(self, num_elves):
        self.elves = []
        for i in xrange(1, num_elves + 1):
            elf = Elf(i)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

        self.pending_toys = SortedCollection(key=itemgetter(1))

    def solve(self, toys):
        hrs = Hours()
        next_toy = None
        current_time = 540  # Santa's Workshop opens Jan 1, 2014 9:00 (= 540 minutes)

        while True:
            next_elf_time, elf = heapq.heappop(self.elves)
            current_time = max(current_time, next_elf_time)

            if (next_toy != None and next_toy.arrival_minute <= current_time):
                self.pending_toys.insert((next_toy, next_toy.duration))
                next_toy = None

            if (next_toy == None):
                for toy in toys:
                    if (toy.arrival_minute <= current_time):
                        self.pending_toys.insert((toy, toy.duration))
                    else:
                        next_toy = toy
                        break

            if (len(self.pending_toys) == 0 and next_toy == None):
                raise StopIteration()

            if (len(self.pending_toys) == 0):
                current_time = next_toy.arrival_minute
                continue

            remaining_time = hrs.get_remaining_sanctioned_time(current_time)

            if (remaining_time == hrs.sanctioned_minutes_per_day
                    and elf.rating >= 4):
                toy, duration = self.pending_toys.pop_le(sys.maxint)
            else:
                try:
                    toy, duration = self.pending_toys.pop_le(remaining_time)
                except ValueError:
                    toy, duration = self.pending_toys.pop_le(sys.maxint)

            work_duration = elf.asign_toy(current_time, toy, hrs)
            heapq.heappush(self.elves, (elf.next_available_time, elf))

            yield toy.id, elf.id, current_time, work_duration, elf.rating
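
pop_le is another extension over the base recipe. With pending_toys keyed on duration, pop_le(remaining_time) removes and returns the longest pending toy that still fits into the sanctioned minutes left in the day, and pop_le(sys.maxint) simply takes the longest toy. A plausible implementation, assuming the recipe's internal _keys/_items lists, would be roughly:

from bisect import bisect_right

def pop_le(self, k):
    'Remove and return the item with the largest key <= k (assumed behaviour).'
    # written as it would sit inside the SortedCollection class
    i = bisect_right(self._keys, k)
    if not i:
        raise ValueError('no item with key <= %r' % (k,))
    del self._keys[i - 1]
    return self._items.pop(i - 1)
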
Example No. 23
    def calculate_final_arrivals(self):
        """
        make self.final_arrival_data a SortedCollection. Final arrival
        for a sequence number is when that sequence number of data and all the
        data before it have arrived, that is, when the data is usable by the
        application. Must be called after self.finish().
        """
        self.final_arrival_data = []
        peak_time = 0.0
        # final arrival vertex always coincides with an arrival vertex
        for vertex in self.arrival_data:
            if vertex[1].ts > peak_time:
                peak_time = vertex[1].ts
                self.final_arrival_data.append((vertex[0], vertex[1].ts))
        self.final_arrival_data = SortedCollection(self.final_arrival_data, key=lambda v: v[0])
Example No. 24
def new_queue(item):
    key = lambda x: x.date
    return SortedCollection([item], key)
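
The recipe's constructor is __init__(self, iterable=(), key=None), so the key function can be passed positionally as above. A hypothetical usage with a made-up record type that has a date attribute:

import datetime
from collections import namedtuple

Event = namedtuple('Event', ['name', 'date'])   # hypothetical item type

q = new_queue(Event('signup', datetime.date(2014, 1, 2)))
q.insert(Event('payment', datetime.date(2014, 1, 1)))

assert [e.name for e in q] == ['payment', 'signup']   # kept ordered by date
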
Example No. 25
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * seq_start = the sequence number at which the data starts, after finish()
    * arrival_data = [(seq_num, pkt)] or SortedCollection
    * final_arrival_data = SortedCollection, after calculate_final_arrivals()
    '''
    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
        flow = tcp.Flow
        '''
        self.arrival_data = []
        self.final_arrival_data = None
        self.closed_cleanly = False  # until proven true
        self.chunks = []
        self.flow = flow
        # the seq number of the first byte of data,
        # valid after finish() if self.data is valid
        self.seq_start = None

    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempts to merge the next chunk (if there
        is one). This way, it is ensured that everything is as fully merged as
        it can be with the current data.

        Args:
        pkt = tcp.Packet
        '''
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i in range(len(self.chunks)):
            chunk = self.chunks[i]
            overlapped, result = chunk.merge(pkt,
                                             self.create_merge_callback(pkt))
            if overlapped: # if the data overlapped
                # if data was added on the back and there is a chunk after this
                if result[1] and i < (len(self.chunks)-1):
                    # try to merge with the next chunk as well
                    # in case that packet bridged the gap
                    overlapped2, result2 = chunk.merge(self.chunks[i+1])
                    if overlapped2: # if that merge worked
                        # data should only be added to back
                        assert( (not result2[0]) and (result2[1]))
                        del self.chunks[i+1] # remove the now-redundant chunk
                merged = True
                break # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)

    def finish(self):
        '''
        Notifies the direction that there are no more packets coming. This means
        that self.data can be decided upon, and arrival_data can be converted to
        a SortedCollection for querying
        '''
        # set data to the data from the first chunk, if there is one
        if self.chunks:
            self.data = self.chunks[0].data
            self.seq_start = self.chunks[0].seq_start
        else:
            self.data = ''
        self.arrival_data = SortedCollection(self.arrival_data, key=lambda v: v[0])
    def calculate_final_arrivals(self):
        '''
        make self.final_arrival_data a SortedCollection. Final arrival
        for a sequence number is when that sequence number of data and all the
        data before it have arrived, that is, when the data is usable by the
        application. Must be called after self.finish().
        '''
        self.final_arrival_data = []
        peak_time = 0.0
        # final arrival vertex always coincides with an arrival vertex
        for vertex in self.arrival_data:
            if vertex[1].ts > peak_time:
                peak_time = vertex[1].ts
                self.final_arrival_data.append((vertex[0], vertex[1].ts))
        self.final_arrival_data = SortedCollection(
            self.final_arrival_data,
            key=lambda v: v[0]
        )

    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        self.chunks.append(chunk)
        self.sort_chunks() # it would be better to insert the packet sorted
    def sort_chunks(self):
        self.chunks.sort(key=lambda chunk: chunk.seq_start)
    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.append((seq_num, pkt))
        return callback
    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream. byte
        is assumed to be zero-based.
        '''
        if self.seq_start:
            return byte + self.seq_start
        else:
            return byte + self.flow.first_packet.seq

    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first arrived.
        self.arrival_data must be a SortedCollection at this point;
        self.finish() must have been called.
        '''
        if self.arrival_data:
            return self.arrival_data.find_le(seq_num)[1]
    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived. Will
        calculate final_arrival_data if it has not been already. Only callable
        after self.finish()
        '''
        if not self.final_arrival_data:
            self.calculate_final_arrivals()
        return self.final_arrival_data.find_le(seq_num)[1]
Example No. 26

    def slow_find_ge(seq, k):
        'First item with a key-value greater-than or equal to k.'
        for item in seq:
            if item >= k:
                return item
        return -1

    def slow_find_gt(seq, k):
        'First item with a key-value greater-than or equal to k.'
        for item in seq:
            if item > k:
                return item
        return -1

    from random import choice
    pool = [1.5, 2, 2.0, 3, 3.0, 3.5, 4, 4.0, 4.5]
    for i in range(500):
        for n in range(6):
            s = [choice(pool) for i in range(n)]
            sc = SortedCollection(s)
            s.sort()
            for probe in pool:
                assert repr(ve2no(sc.index,
                                  probe)) == repr(slow_index(s, probe))
                assert repr(ve2no(sc.find, probe)) == repr(slow_find(s, probe))
                assert repr(ve2no(sc.find_le,
                                  probe)) == repr(slow_find_le(s, probe))
                assert repr(ve2no(sc.find_lt,
                                  probe)) == repr(slow_find_lt(s, probe))
                assert repr(ve2no(sc.find_ge,
                                  probe)) == repr(slow_find_ge(s, probe))
                assert repr(ve2no(sc.find_gt,
                                  probe)) == repr(slow_find_gt(s, probe))
            for i, item in enumerate(s):
                assert repr(item) == repr(sc[i])  # test __getitem__
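
The comparisons above rely on a ve2no helper that is not shown in this excerpt; in the standard recipe's self-test it wraps a lookup and turns the ValueError raised on a miss into the same -1 sentinel the slow_* reference functions return:

def ve2no(f, *args):
    'Convert a ValueError-raising lookup into the -1 sentinel.'
    try:
        return f(*args)
    except ValueError:
        return -1
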
Example No. 27
    def __init__(self, dna):
        self.dna = dna
        self.byYield = []
        self.D = {}  # (yield, i) => set(Tuple)
        self.L = SortedCollection(
            key=lambda tup: tup.getStart())  # sorted by i
Example No. 28
class InsertablesCollection():
    def __init__(self, dna):
        self.dna = dna
        self.byYield = []
        self.D = {}  # (yield, i) => set(Tuple)
        self.L = SortedCollection(
            key=lambda tup: tup.getStart())  # sorted by i

        # Combinations of tuples, stored in a collection sorted by start, of
        # collections sorted by end, of non-overlapping combinations of Tuples.
        # Can be accessed with like: groups = C[start][end], which will return
        # an collection of all combinations of non-overlapping insertables between (start, end).
        #self.C = CombinationCollection()

    def __repr__(self):
        return self.D.values().__repr__()

    def beginStage(self):
        self.tuplesForStage = []

    def addToStage(self, tup):
        self.tuplesForStage.append(tup)

    def getTuplesForStage(self):
        return self.tuplesForStage

    def filterStage(self, aFilter):
        self.tuplesForStage = aFilter.filter(self.tuplesForStage)

    def completeStage(self):
        self.addAll(self.tuplesForStage)
        self.tuplesForStage = None

    def addAll(self, tuples):
        for tup in tuples:
            self.add(tup)
            #if len(self.D.values()) > 12:
            #    None
            print "\t\tAdded insertable:" + tup.derivation.toString(self.dna)

    def add(self, tup):
        (i, j, k, l) = tup.coordinates

        # add to byYield
        try:
            setByYield = self.byYield[tup.getYield()]
        except IndexError:
            setByYield = set()
            self.byYield.extend(
                [set() for x in range(len(self.byYield), tup.getYield())]
            )  # fill in any missing entries up to current tuple yield (which may never be used)
            self.byYield.append(setByYield)
        setByYield.add(tup)

        # add to D
        tupleSet = set()
        try:
            tupleSet = self.D[tup.getYield(), i]
        except KeyError:
            self.D[tup.getYield(), i] = tupleSet
        tupleSet.add(tup)

        # add to sorted list
        self.L.insert_right(tup)

    # return all tuples (i,j,k,l) such that i>=start and l<=end, *** sorted by i ***
    def getTuplesInSegment(self, start, end):
        tuples = []
        if (len(self.L) == 0) or (start > self.L[-1].getStart()):
            return tuples
        left = self.L.find_ge(start)  # tuple with lowest i >= start
        for tup in self.L[self.L.index(left):]:
            (i, j, k, l) = tup.coordinates
            if (tup.getStart() > end):
                return tuples
            if (tup.getEnd() > end):
                continue
            tuples.append(tup)
        return tuples

    # Return a list of all tuples
    def getAllTuples(self):
        tuples = []
        for tupleSet in self.D.itervalues():
            tuples.extend(tupleSet)
        return tuples

    # Return a list of tuples for the given yield
    def getTuplesOfYield(self, tYield):
        try:
            return list(self.byYield[tYield])
        except IndexError:
            return []  # no parse entries for stage

    # Return a list of tuples for the given yield
    def getTuplesOfYieldAndI(self, tYield, i):
        if (tYield, i) in self.D:
            return self.D[tYield, i]
        else:
            return set()

    # Return a list of all combinations of non-overlapping insertables in region (start, end)
    def combinations(self, start, end):
        time0 = time.clock()
        segment = self.getTuplesInSegment(start, end)
        nonoverlapping = []
        limit = min(stag13.Probabilities.MAX_INSERTIONS_PER_MIDDLE + 1,
                    len(segment) + 1)
        for k in range(1, limit):
            for combination in itertools.combinations(segment, k):
                try:
                    tup0 = None
                    for tup in combination:
                        if tup0 is None:
                            tup0 = tup
                        else:
                            if tup.overlaps(tup0):
                                raise ValueError()
                            tup0 = tup
                    nonoverlapping.append(combination)
                except ValueError:
                    None
        #print "\t\t---> insertion combinations (%s..%s): %-3s %0.1e s" % (
        #    start, end, len(nonoverlapping), time.clock()-time0)
        return nonoverlapping
Example No. 29
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start,
                                    end,
                                    exc_info=True)
                    sys.exit('failed to cope with %r in %r' %
                             (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' %
                                    label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
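
The label lookups above follow one pattern: labels are keyed on their starting byte offset, find_le fetches the closest label that starts at or before a position, and the caller then checks whether that label's span actually reaches the position. A simplified stand-alone illustration with made-up (first, length, tag) tuples in place of label objects:

from operator import itemgetter

labels = SortedCollection(
    [(10, 4, 'NAME'), (30, 6, 'PLACE')],          # (first, length, tag)
    key=itemgetter(0))

first, length, tag = labels.find_le(12)           # starts at or before byte 12
assert tag == 'NAME' and first + length > 12      # ...and still covers byte 12
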
Example No. 30
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
Example No. 31
class CloseWordsGenerator(object):
    def __init__(self, correct_words, transmatrix=None, max_distance=3):
        self.__store_matrix(transmatrix)
        self.corrects = set(tuple(c for c in w) for w in correct_words)
        self.max_dist = max_distance

    def __store_matrix(self, m):
        """stores every row of the matrix in weighted order
        """
        chars = set(m.iterkeys())
        for inner_d in m.itervalues():
            chars |= set(inner_d.itervalues())
        d = {}
        for c1, row in m.iteritems():
            values = [(c2, w) for c2, w in row.iteritems() if w > 1e-10]
            d[c1] = sorted(values, key=lambda x: x[1])
        self.transitions = d

    def get_char_changes(self, word, src_char):
        row = self.transitions[src_char]
        values = []
        for tgt, weight in row:
            new_words = set(gen_changed(word, src_char, tgt))
            values.append((weight, new_words))
        return sorted(values, key=lambda x: x[0])

    def get_closest(self, word):
        """Computes closest word(s) based on stored transition matrix
        """
        t = self.transitions
        chars = set(word) | set([''])

        if word not in self.change_cache:
            self.change_cache[word] = []
            for c in chars:
                if c not in t or len(t[c]) == 0:
                    continue

                self.change_cache[word] += self.get_char_changes(word, c)

        if len(self.change_cache[word]) == 0:
            del self.change_cache[word]
            return None

        return self.change_cache[word].pop(0)

    def choose_next(self):
        if len(self.not_done) == 0:
            return

        return self.not_done[0]

    def __get_closest_for_seen(self):
        best = [None, set()]
        while len(best[1]) == 0:
            n = self.choose_next()
            if n is None:
                break

            word, (old_weight, old_dist) = n
            # skip if old_word is already as far as it can be
            if old_dist == self.max_dist:
                self.done.add(word)
                self.not_done.remove(self.not_done[0])
                continue

            cl = self.get_closest(word)
            if cl is None:
                self.done.add(word)
                self.not_done.remove(self.not_done[0])
                continue

            change_weight, new_words = cl

            new_weight = old_weight + change_weight
            if best[0] is None:
                best[0] = (new_weight, old_dist + 1)
                best[1] = new_words
            elif new_weight < best[0][0]:
                best[0] = (new_weight, old_dist + 1)
                best[1] = new_words
            elif new_weight == best[0][0]:
                best[1] |= new_words
        return best

    def get_closest_correct(self, word):
        word = tuple(c for c in word)

        # caching variables for speedup
        self.seen = {word: (0., 0)}
        self.change_cache = {}
        self.done = set()
        self.not_done = SortedCollection(key=lambda x: x[1][0])
        self.not_done.insert(self.seen.items()[0])

        while True:
            new_value, new_words = self.__get_closest_for_seen()
            if len(new_words) == 0:
                return None

            correct_words = new_words & self.corrects
            if len(correct_words) > 0:
                return correct_words

            for w in new_words:
                if w not in self.seen:
                    self.seen[w] = new_value
                    self.not_done.insert((w, new_value))
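
Keying not_done on x[1][0], the accumulated transition weight, makes it behave like a min-priority queue over (word, (weight, distance)) entries: the cheapest frontier word is always not_done[0], and the search loop pops it with remove. A small illustration of that invariant, with made-up words and weights:

frontier = SortedCollection(key=lambda x: x[1][0])
frontier.insert((('c', 'a', 't'), (0.0, 0)))
frontier.insert((('c', 'a', 'r'), (0.7, 1)))
frontier.insert((('c', 'o', 't'), (0.3, 1)))

cheapest = frontier[0]             # (('c', 'a', 't'), (0.0, 0))
frontier.remove(cheapest)          # pop it, as __get_closest_for_seen does
assert frontier[0][1][0] == 0.3    # next-cheapest word moves to the front
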
Example No. 32
class Direction:
    '''
  Represents data moving in one direction in a TCP flow.

  Members:
  * chunks = [tcp.Chunk], sorted by seq_start
  * flow = tcp.Flow, the flow to which the direction belongs
  * seq_start = the sequence number at which the data starts, after finish()
  * arrival_data = [(seq_num, pkt)] or SortedCollection
  * final_arrival_data = SortedCollection, after calculate_final_arrivals()
  '''
    def __init__(self, flow):
        '''
    Sets things up for adding packets.

    Args:
    flow = tcp.Flow
    '''
        self.arrival_data = []
        self.final_arrival_data = None
        self.closed_cleanly = False  # until proven true
        self.chunks = []
        self.flow = flow
        # the seq number of the first byte of data,
        # valid after finish() if self.data is valid
        self.seq_start = None

    def add(self, pkt):
        '''
    Merge the packet into the first chunk it overlaps with. If data was
    added to the end of a chunk, attempts to merge the next chunk (if there
    is one). This way, it is ensured that everything is as fully merged as
    it can be with the current data.

    Args:
    pkt = tcp.Packet
    '''
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i in range(len(self.chunks)):
            chunk = self.chunks[i]
            overlapped, result = chunk.merge(pkt,
                                             self.create_merge_callback(pkt))
            if overlapped:  # if the data overlapped
                # if data was added on the back and there is a chunk after this
                if result[1] and i < (len(self.chunks) - 1):
                    # try to merge with the next chunk as well in case that packet
                    # bridged the gap
                    overlapped2, result2 = chunk.merge(self.chunks[i + 1])
                    if overlapped2:  # if that merge worked
                        # data should only be added to back
                        assert ((not result2[0]) and (result2[1]))
                        del self.chunks[i +
                                        1]  # remove the now-redundant chunk
                merged = True
                break  # skip further chunks
        if not merged:
            # Nothing is overlapped with the packet. We need a new chunk.
            self.new_chunk(pkt)

    def finish(self):
        '''
    Notifies the direction that there are no more packets coming. This means
    that self.data can be decided upon, and arrival_data can be converted to
    a SortedCollection for querying
    '''
        # set data to the data from the first chunk, if there is one
        if self.chunks:
            self.data = self.chunks[0].data
            self.seq_start = self.chunks[0].seq_start
        else:
            self.data = ''
        self.arrival_data = SortedCollection(self.arrival_data,
                                             key=lambda v: v[0])

    def calculate_final_arrivals(self):
        '''
    make self.final_arrival_data a SortedCollection. Final arrival
    for a sequence number is when that sequence number of data and all the
    data before it have arrived, that is, when the data is usable by the
    application. Must be called after self.finish().
    '''
        self.final_arrival_data = []
        peak_time = 0.0
        # final arrival vertex always coincides with an arrival vertex
        for vertex in self.arrival_data:
            if vertex[1].ts > peak_time:
                peak_time = vertex[1].ts
                self.final_arrival_data.append((vertex[0], vertex[1].ts))
        self.final_arrival_data = SortedCollection(self.final_arrival_data,
                                                   key=lambda v: v[0])

    def new_chunk(self, pkt):
        '''
    creates a new tcp.Chunk for the pkt to live in. Only called if an
    attempt has been made to merge the packet with all existing chunks.
    '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        self.chunks.append(chunk)
        self.sort_chunks()  # it would be better to insert the packet sorted

    def sort_chunks(self):
        self.chunks.sort(key=lambda chunk: chunk.seq_start)

    def create_merge_callback(self, pkt):
        '''
    Returns a function that will serve as a callback for Chunk. It will
    add the passed sequence number and the packet to self.arrival_data.
    '''
        def callback(seq_num):
            self.arrival_data.append((seq_num, pkt))

        return callback

    def byte_to_seq(self, byte):
        '''
    Converts the passed byte index to a sequence number in the stream. byte
    is assumed to be zero-based.
    '''
        if self.seq_start:
            return byte + self.seq_start
        else:
            return byte + self.flow.first_packet.seq

    def seq_arrival(self, seq_num):
        '''
    returns the packet in which the specified sequence number first arrived.
    self.arrival_data must be a SortedCollection at this point;
    self.finish() must have been called.
    '''
        if self.arrival_data:
            return self.arrival_data.find_le(seq_num)[1]

    def seq_final_arrival(self, seq_num):
        '''
    Returns the time at which the seq number had fully arrived. Will
    calculate final_arrival_data if it has not been already. Only callable
    after self.finish()
    '''
        if not self.final_arrival_data:
            self.calculate_final_arrivals()
        return self.final_arrival_data.find_le(seq_num)[1]
Example No. 33
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * finished = bool. Indicates whether more packets should be expected.
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * arrival_data = SortedCollection([(seq_num, pkt)])
    * final_arrival_data = SortedCollection([(seq_num, ts)])
    * final_data_chunk = Chunk or None, the chunk that contains the final data,
      only after seq_start is valid
    * final_arrival_pointer = the end sequence number of data that has
      completely arrived
    '''
    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
        flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None
    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempts to merge the next chunk (if there
        is one). This way, it is ensured that everything is as fully merged as
        it can be with the current data.

        Args:
        pkt = tcp.Packet
        '''
        if self.finished:
            raise RuntimeError('tried to add packets to a finished tcp.Direction')
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i, chunk in enumerate(self.chunks):
            overlapped, (front, back) = chunk.merge(pkt,
                                             self.create_merge_callback(pkt))
            if overlapped:
                # check if this packet bridged the gap between two chunks
                if back and i < (len(self.chunks)-1):
                    overlapped2, result2 = chunk.merge(self.chunks[i+1])
                    if overlapped2:
                        assert( (not result2[0]) and (result2[1]))
                        del self.chunks[i+1]
                # if this is the main data chunk, calc final arrival
                if self.seq_start and chunk.seq_start == self.seq_start:
                    if back:
                        self.final_arrival_data.insert((self.final_arrival_pointer, pkt.ts))
                    if not self.final_data_chunk:
                        self.final_data_chunk = chunk
                    self.final_arrival_pointer = self.final_data_chunk.seq_end
                merged = True
                break # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)
    @property
    def data(self):
        '''
        returns the TCP data, as far as it has been determined.
        '''
        if self.final_data_chunk:
            return self.final_data_chunk.data
        else:
            if self.finished:
                return '' # no data was ever added
            else:
                return None # just don't know at all
    @property
    def seq_start(self):
        '''
        starting sequence number, as far as we can tell now.
        '''
        if self.flow.handshake:
            if self is self.flow.fwd:
                return self.flow.handshake[2].seq
            elif self is self.flow.rev:
                return self.flow.handshake[1].seq + 1
            else:
                raise RuntimeError(
                    "holy crap, tcp.Direction has a flow it doesn't belong to")
        elif self.finished:
            if self.chunks:
                return self.chunks[0].seq_start
            else:
                log.warning('getting seq_start from finished tcp.Direction '
                            'with no handshake and no data')
                return None
        else:
            return None
    def finish(self):
        '''
        Notifies the direction that there are no more packets coming. This means
        that self.data can be decided upon.
        '''
        self.finished = True
        if self.chunks and not self.final_data_chunk:
            self.final_data_chunk = self.chunks[0]
    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        if self.seq_start and chunk.seq_start == self.seq_start:
            self.final_data_chunk = chunk
            self.final_arrival_pointer = chunk.seq_end
            self.final_arrival_data.insert((pkt.seq, pkt.ts))
        # it would be better to insert the packet sorted here
        self.chunks.append(chunk)
        self.chunks.sort(key=lambda chunk: chunk.seq_start)
    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.insert((seq_num, pkt))
        return callback
    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream. byte
        is assumed to be zero-based. Returns None if seq_start is None
        '''
        # TODO better handle case where seq_start is None
        seq_start = self.seq_start
        if seq_start is not None:
            return byte + seq_start
        else:
            return None
    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first arrived.
        '''
        return self.arrival_data.find_le(seq_num)[1]
    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived, that is,
        when all the data before it had also arrived.
        '''
        return self.final_arrival_data.find_le(seq_num)[1]
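
The seq_arrival and seq_final_arrival lookups above lean on SortedCollection.find_le, which returns the last record whose key is at or below the probe (and raises ValueError when there is none). A minimal, self-contained sketch of that lookup pattern, using bisect directly since the SortedCollection import path differs per project; the sequence numbers and timestamps are invented for illustration:

import bisect

# arrival-style records: (seq_num, timestamp), kept sorted by seq_num,
# mimicking SortedCollection(key=itemgetter(0))
final_arrival = [(1000, 0.00), (2460, 0.12), (3920, 0.31)]

def find_le(records, seq_num):
    # same semantics as SortedCollection.find_le: return the last record
    # whose key (the sequence number) is <= seq_num, else raise ValueError
    keys = [r[0] for r in records]
    i = bisect.bisect_right(keys, seq_num)
    if not i:
        raise ValueError('no record at or before seq %d' % seq_num)
    return records[i - 1]

print(find_le(final_arrival, 3000))   # -> (2460, 0.12)
print(find_le(final_arrival, 1000))   # -> (1000, 0.0)
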
Ejemplo n.º 34
0
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * finished = bool. Indicates whether more packets should be expected.
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * arrival_data = SortedCollection([(seq_num, pkt)])
    * final_arrival_data = SortedCollection([(seq_num, ts)])
    * final_data_chunk = Chunk or None, the chunk that contains the final data,
      only after seq_start is valid
    * final_arrival_pointer = the end sequence number of data that has
      completely arrived
    '''
    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
        flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None
    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempt to merge the next chunk (if there
        is one). This ensures that everything is as fully merged as it can be
        with the current data.

        Args:
        pkt = tcp.Packet
        '''
        if self.finished:
            raise RuntimeError('tried to add packets to a finished tcp.Direction')
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i, chunk in enumerate(self.chunks):
            overlapped, (front, back) = chunk.merge(pkt,
                                             self.create_merge_callback(pkt))
            if overlapped:
                # check if this packet bridged the gap between two chunks
                if back and i < (len(self.chunks)-1):
                    overlapped2, result2 = chunk.merge(self.chunks[i+1])
                    if overlapped2:
                        assert( (not result2[0]) and (result2[1]))
                        del self.chunks[i+1]
                # if this is the main data chunk, calc final arrival
                if self.seq_start and chunk.seq_start == self.seq_start:
                    if front: # packet was first in stream but just now arriving
                        self.final_arrival_data.insert((self.seq_start, pkt.ts))
                    if back: # usual case
                        self.final_arrival_data.insert((self.final_arrival_pointer, pkt.ts))
                    if not self.final_data_chunk:
                        self.final_data_chunk = chunk
                    self.final_arrival_pointer = self.final_data_chunk.seq_end
                merged = True
                break # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)
    @property
    def data(self):
        '''
        returns the TCP data, as far as it has been determined.
        '''
        if self.final_data_chunk:
            return self.final_data_chunk.data
        else:
            if self.finished:
                return '' # no data was ever added
            else:
                return None # just don't know at all
    @property
    def seq_start(self):
        '''
        starting sequence number, as far as we can tell now.
        '''
        if self.flow.handshake:
            if self is self.flow.fwd:
                return self.flow.handshake[2].seq
            elif self is self.flow.rev:
                return self.flow.handshake[1].seq + 1
            else:
                raise RuntimeError(
                    "holy crap, tcp.Direction has a flow it doesn't belong to")
        elif self.finished:
            if self.chunks:
                return self.chunks[0].seq_start
            else:
                log.warning('getting seq_start from finished tcp.Direction '
                            'with no handshake and no data')
                return None
        else:
            return None
    def finish(self):
        '''
        Notifies the direction that there are no more packets coming. This means
        that self.data can be decided upon. Also calculates final_arrival for
        any packets that arrived while seq_start was None
        '''
        self.finished = True
        # calculate final_arrival
        if not self.final_arrival_data:
            peak_time = 0.0
            for vertex in self.arrival_data:
                if vertex[1].ts > peak_time:
                    peak_time = vertex[1].ts
                    self.final_arrival_data.insert((vertex[0], vertex[1].ts))
        if self.chunks and not self.final_data_chunk:
            self.final_data_chunk = self.chunks[0]
    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        if self.seq_start and chunk.seq_start == self.seq_start:
            self.final_data_chunk = chunk
            self.final_arrival_pointer = chunk.seq_end
            self.final_arrival_data.insert((pkt.seq, pkt.ts))
        # it would be better to insert the chunk sorted here
        self.chunks.append(chunk)
        self.chunks.sort(key=lambda chunk: chunk.seq_start)
    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.insert((seq_num, pkt))
        return callback
    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream. byte
        is assumed to be zero-based. Returns None if seq_start is None
        '''
        # TODO better handle case where seq_start is None
        seq_start = self.seq_start
        if seq_start is not None:
            return byte + seq_start
        else:
            return None
    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first arrived.
        '''
        try:
            return self.arrival_data.find_le(seq_num)[1]
        except ValueError:
            return None
    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived, that is,
        when all the data before it had also arrived.
        '''
        try:
            return self.final_arrival_data.find_le(seq_num)[1]
        except ValueError:
            return None
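
The add() docstring above describes the bridging behaviour: a packet is merged into the first chunk it overlaps, and if it extended that chunk's end, the following chunk is tried as well, so a packet that fills a gap collapses two chunks into one. A minimal sketch of that idea, using plain [start, end) byte ranges as stand-ins for tcp.Chunk (the real Chunk.merge also reports (front, back) growth, copies payload bytes and fires the arrival callback, all of which is omitted here):

# chunks kept sorted by start, each a half-open [start, end) range
chunks = [[100, 200], [300, 400]]

def add_segment(chunks, start, end):
    # merge the new segment into the first chunk it overlaps or touches
    for i, chunk in enumerate(chunks):
        if start <= chunk[1] and end >= chunk[0]:
            grew_back = end > chunk[1]
            chunk[0] = min(chunk[0], start)
            chunk[1] = max(chunk[1], end)
            # if the back grew, the segment may have bridged the gap to
            # the next chunk; if so, fold the next chunk into this one
            if grew_back and i + 1 < len(chunks) and chunk[1] >= chunks[i + 1][0]:
                chunk[1] = max(chunk[1], chunks[i + 1][1])
                del chunks[i + 1]
            return
    # no overlap anywhere: start a new chunk and keep the list sorted
    chunks.append([start, end])
    chunks.sort()

add_segment(chunks, 200, 300)   # fills the gap between the two chunks
print(chunks)                   # -> [[100, 400]]
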
Ejemplo n.º 35
0
class KeyframeSceneTemplate(SceneTemplate):


	class KeyframePlayableScene(PlayableScene):
		
		def __init__(self,scene_template,**kwargs):
				
			self.colors = kwargs.pop('colors')
			PlayableScene.__init__(self,**kwargs)
			self.scene_template = scene_template
			
			self.seconds_in_scene = 0
			self.beats_in_scene = 0
			
			self._prev_frame = None # The previous frame before 'now'
			self._next_frame = None # The next frame after 'now'
			self._prev_at = None # The time in seconds at which the previous frame occurred
			self._next_at = None # The time in seconds at which the next frame will occur
			self._prev_rgb = None # [(0,0,0)]*PANEL_NUM
			self._next_rgb = None #[(0,0,0)]*PANEL_NUM
			
		def step(self,seconds):
			if not PlayableScene.step(self,seconds):
				# If nothing happened in this step, don't bother with the calculation
				# this probably just means that seconds = 0
				return False
				
			self.seconds_in_scene += seconds
			self.beats_in_scene += self.beats_in_step
			if self.beats_in_scene >= self.scene_template.scene_length: # TODO: Determine if this should be >= or >
				self.beats_in_scene = self.beats_in_scene % self.scene_template.scene_length
				self.seconds_in_scene = self.seconds_in_scene % (self.scene_template.scene_length*self.spb())
			
			return True # Remember to return True or nothing happens!
			
			
		def rgb(self):
			if not self._prev_frame or not self._next_frame or self.beats() != 0:
				old_prev_frame = self._prev_frame
				old_next_frame = self._next_frame
				looped_forwards_to_beginning = False
				looped_backwards_to_end = False
				try:
					self._prev_frame = self.scene_template._keyframes.find_le(self.beats_in_scene)
				except ValueError:
					# If we can't find a keyframe at or before this beat then we're at the
					# beginning, so wrap around and use the last frame
					self._prev_frame = self.scene_template._keyframes[-1]
					looped_backwards_to_end = True
				try:
					self._next_frame = self.scene_template._keyframes.find_gt(self.beats_in_scene)
				except ValueError:
					# If we can't find a keyframe after this beat then we're at the end,
					# so wrap around and use the first frame
					self._next_frame = self.scene_template._keyframes[0]
					looped_forwards_to_beginning = True
					
				# If the keyframes have changed, recalculate rgb values
				if old_prev_frame != self._prev_frame or old_next_frame != self._next_frame: 
					
					if looped_backwards_to_end:
						# If we have to loop around to the end to get the 'previous' keyframe
						# then do calculations with that in mind
						from_end = self.scene_template.scene_length*self.spb()-self._prev_frame[0]*self.spb()
						self._prev_at = -(from_end)
					else:
						# Otherwise the previous time of the keyframe is whatever beat that keyframe is on times secs/beat
						self._prev_at = self._prev_frame[0]*self.spb()
					
					
					if looped_forwards_to_beginning:
						from_begin = self._next_frame[0]*self.spb()
						self._next_at = self.scene_template.scene_length*self.spb() + from_begin
						
					else:
						self._next_at = self._next_frame[0]*self.spb()
						
					
					
					assert(self._prev_at < self.seconds_in_scene)
					assert(self._next_at > self.seconds_in_scene)
				
					self._prev_rgb = [self.colors[x] for x in self._prev_frame[1].panel_colors]
					self._next_rgb = [self.colors[x] for x in self._next_frame[1].panel_colors]
					
					#print 'Using new keyframe'
					#print self._prev_rgb
					#print self._next_rgb
				else:
					pass
					#print 'Using same keyframe'
					
			# If the next keyframe is a 'switch to' frame
			if isinstance(self._next_frame[1],KeyframeSceneTemplate.SwitchToFrame):
				# then just play the previously calculated rgb
				return self._prev_rgb # Just play the previous
			elif isinstance(self._next_frame[1],KeyframeSceneTemplate.FadeToFrame):
				# Calculate the rgb for 'now'
				
				#      seconds after the last keyframe  OVER  total seconds between keyframe
				progress = (self.seconds_in_scene - self._prev_at) / (self._next_at - self._prev_at)
				#print "We are %s%% through this area between keyframes at %s and %s" % ((progress*100),self._prev_at,self._next_at)
				assert (0 < progress < 1)
				
				now_rgb = []
				
				# Look at each panel
				for i in xrange(PANEL_NUM):
					now_rgb.append( \
						map(lambda a,b: int((b-a)*progress+a), self._prev_rgb[i],self._next_rgb[i]), 
						# (self._next_rgb[i] - self._prev_rgb[i])*progress) + self._prev_rgb[i] \
					)
					
				return now_rgb 
			
			else:
				raise ValueError, 'Unknown frame type encountered %s at %s' % (self._next_frame[1],self._next_frame[0])
	
	
	def __init__(self,scene_length):
		self._keyframes = SortedCollection(key=itemgetter(0))
		self.scene_length = scene_length
		
	
	def set_wall_frame(self,at_beat,panel_color,fade_to=True):
		return self.set_frame(at_beat,[panel_color]*PANEL_NUM,fade_to)
	
	def set_frame(self,at_beat,panel_colors,fade_to=True):
		"""`panel_colors` is either a list which is the same length
		as the number of panels in the lightwall, each entry an integer
		'color id' (or None), OR a dictionary whose keys are zero-based
		panel indices (less than the number of panels) and whose values
		are integer 'color ids'.
		e.g., [3,1,None,4,...] is equivalent to {0:3,1:1,3:4}

		If there is already a keyframe at the given beat, it is overwritten.
		"""
		if type(at_beat) != int:
			print "Warning: setting a non-integer beat"
		
		if type(panel_colors) == dict:
			l = []
			for i in xrange(PANEL_NUM):
				l.append(panel_colors.get(i,None))
			panel_colors = l
		
		if at_beat < 0:
			raise ValueError,"Tried to add a keyframe before beginning of scene (<0)"
		elif at_beat > self.scene_length:
			raise ValueError, "Tried to add a keyframe after end of scene (>scene_length)"
		
		
		try:
			# try to remove a keyframe at that beat because we want to
			# overwrite it if there's one there
			self._keyframes.remove(self._keyframes.find(at_beat))
		except ValueError:
			pass
		
		if fade_to:
			self._keyframes.insert((at_beat,KeyframeSceneTemplate.FadeToFrame(panel_colors)))
		else:
			self._keyframes.insert((at_beat,KeyframeSceneTemplate.SwitchToFrame(panel_colors)))
		
		
	def remove_frame(self,at_beat):
		self._keyframes.remove(self._keyframes.find(at_beat))

				
	
	def bind(self,scene_data):
		return KeyframeSceneTemplate.KeyframePlayableScene(self,**scene_data)
		
	
	
	class Frame(object):
		def __init__(self,panel_colors):
			self.panel_colors = panel_colors
	class FadeToFrame(Frame): pass
	class SwitchToFrame(Frame): pass
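
rgb() above looks up the keyframes surrounding the current beat with find_le/find_gt on the SortedCollection of keyframes and, for a FadeToFrame, linearly interpolates each panel's colour by how far the scene has progressed between the two keyframes. A stand-alone sketch of just that interpolation step (it ignores the wrap-around handling for the first/last keyframe; the times and RGB values are invented for illustration):

prev_at, next_at = 2.0, 6.0             # seconds at which the surrounding keyframes sit
now = 3.0                               # current position within the scene
prev_rgb = [(0, 0, 0), (255, 0, 0)]     # one RGB tuple per panel
next_rgb = [(255, 255, 255), (0, 0, 255)]

# seconds after the last keyframe OVER total seconds between keyframes
progress = (now - prev_at) / (next_at - prev_at)   # 0.25

def lerp(a, b):
    # same per-channel blend as the map(lambda a,b: ...) call in rgb()
    return tuple(int((y - x) * progress + x) for x, y in zip(a, b))

now_rgb = [lerp(p, n) for p, n in zip(prev_rgb, next_rgb)]
print(now_rgb)   # -> [(63, 63, 63), (191, 0, 63)]
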
Ejemplo n.º 36
0
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'
    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, 
                    first=sent_start + start,
                    length = end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                     tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences

    def process_item(self, stream_item, context=None):
        if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible:
            return stream_item
            
        self.label_index = None
        self.label_to_mention_id = dict()
        stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item)

        return stream_item

    def __call__(self, stream_item, context=None):
        ## support the legacy callable API
        return self.process_item(stream_item, context)
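
The token-labelling loop in make_sentences above finds the nearest label starting at or before a token with label_index.find_le(token_start), then checks whether that label's byte span actually reaches the token before attaching it. A stand-alone sketch of the same containment test, with plain (first, length) tuples standing in for the Label/Offset objects and invented byte offsets:

import bisect

# labels sorted by their first byte offset, as in make_label_index
labels = [(10, 5), (40, 12)]            # (first, length) byte spans
firsts = [first for first, _ in labels]

def label_covering(token_start):
    # find_le: the last label starting at or before token_start
    i = bisect.bisect_right(firsts, token_start)
    if not i:
        return None                     # no label starts before this token
    first, length = labels[i - 1]
    # only a hit if the label's span actually reaches the token
    if first + length > token_start:
        return labels[i - 1]
    return None

print(label_covering(45))   # -> (40, 12): bytes 40..51 cover byte 45
print(label_covering(30))   # -> None: the label at byte 10 ends at byte 15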