def __init__(self, num_elves):
    self.elves = []
    for i in xrange(1, num_elves + 1):
        elf = Elf(i)
        heapq.heappush(self.elves, (elf.next_available_time, elf))
    self.pending_toys = SortedCollection(key=itemgetter(1))
def make_label_index(self, stream_item):
    'make a sortedcollection on body.labels'
    labels = stream_item.body.labels.get(self.config.get('annotator_id'))
    if not labels:
        labels = []
    self.label_index = SortedCollection(
        labels,
        key=lambda label: label.offsets[OffsetType.BYTES].first)
def on_orderbook_snapshot(data):
    order_books[symbol]['bids'] = SortedCollection(
        [[float(pq) for pq in bid] for bid in data['bids']],
        key=itemgetter(0))
    order_books[symbol]['asks'] = SortedCollection(
        [[float(pq) for pq in ask] for ask in data['asks']],
        key=itemgetter(0))
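# Illustrative sketch, not from the source: once the snapshot handler above has
# populated the per-symbol SortedCollections keyed by price (itemgetter(0)),
# the best quotes sit at the ends of each collection, and a price level can be
# replaced with remove()/insert(). The helper names below and the assumption
# that qty == 0 means "delete the level" are hypothetical, for illustration only.
def best_bid_ask(symbol):
    book = order_books[symbol]
    return book['bids'][-1], book['asks'][0]  # highest bid, lowest ask

def replace_level(symbol, side, price, qty):
    book = order_books[symbol][side]
    try:
        book.remove(book.find(price))  # drop any existing [price, qty] entry
    except ValueError:
        pass  # no level at this price yet
    if qty > 0:
        book.insert([price, qty])  # SortedCollection keeps the book sorted by price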
def __init__(self, markets: list, ticker_observer: TickerObserver):
    super().__init__(markets, ticker_observer)
    self.mapping = {}  # maps pairs to bitfinex channel ids
    for m in markets:
        self.order_books[m] = {
            'bids': SortedCollection(key=itemgetter(0)),
            'asks': SortedCollection(key=itemgetter(0))
        }
def __init__(self, timeline=None):
    """
    Initialize.

    Kwargs:
        timeline (list of Actions): a timeline of all Replay Actions
    """
    if timeline is not None:
        self.timeline = timeline
    else:
        self.timeline = SortedCollection(key=attrgetter('timestamp'))
    self.repSets = []  # TODO: Define repset class
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.
    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]
    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.
    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first
    )
    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(offset_type)
            assert label_off.length == len(label_off.value)
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))
            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            #print "find_le: ", token_collection.find_le(label_off.first)
            toks = list(toks)
            #print 'aligned tokens', toks
            for tok in toks:
                add_annotation(tok, label)
                ## only for debugging
                assert tok.token is not None, tok.token
                if not tok.token in label_off.value:
                    sys.exit('%r not in %r' % (
                        [(t.offsets[offset_type].first, t.token) for t in toks],
                        label_off.value))
def __init__(self, flow):
    '''
    Sets things up for adding packets.

    Args:
        flow = tcp.Flow
    '''
    self.finished = False
    self.flow = flow
    self.arrival_data = SortedCollection(key=itemgetter(0))
    self.final_arrival_data = SortedCollection(key=itemgetter(0))
    self.final_arrival_pointer = None
    self.chunks = []
    self.final_data_chunk = None
def finish(self):
    '''
    Notifies the direction that there are no more packets coming.

    This means that self.data can be decided upon, and arrival_data can be
    converted to a SortedCollection for querying
    '''
    # set data to the data from the first chunk, if there is one
    if self.chunks:
        self.data = self.chunks[0].data
        self.seq_start = self.chunks[0].seq_start
    else:
        self.data = ''
    self.arrival_data = SortedCollection(self.arrival_data,
                                         key=lambda v: v[0])
def calculate_final_arrivals(self):
    '''
    make self.final_arrival_data a SortedCollection. Final arrival
    for a sequence number is when that sequence number of data and
    all the data before it have arrived, that is, when the data is
    usable by the application. Must be called after self.finish().
    '''
    self.final_arrival_data = []
    peak_time = 0.0
    # final arrival vertex always coincides with an arrival vertex
    for vertex in self.arrival_data:
        if vertex[1].ts > peak_time:
            peak_time = vertex[1].ts
            self.final_arrival_data.append((vertex[0], vertex[1].ts))
    self.final_arrival_data = SortedCollection(self.final_arrival_data,
                                               key=lambda v: v[0])
class Replay(object):
    """ A Replay object """

    def __init__(self, timeline=None):
        """
        Initialize.

        Kwargs:
            timeline (list of Actions): a timeline of all Replay Actions
        """
        if timeline is not None:
            self.timeline = timeline
        else:
            self.timeline = SortedCollection(key=attrgetter('timestamp'))
        self.repSets = []  # TODO: Define repset class

    def insertAction(self, action):
        if isinstance(action, Action):
            self.timeline.insert(action)

    def insertActions(self, actionList):
        for action in actionList:
            if isinstance(action, Action):
                self.insertAction(action)

    def playback(self):
        for action in self.timeline:
            print str(action)

    def __eq__(self, other):
        if isinstance(other, Replay):
            # TODO: Is this good enough?
            return self.timeline == other.timeline
        return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
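# Illustrative sketch, not from the source: because Replay.timeline is a
# SortedCollection keyed on attrgetter('timestamp'), actions can be inserted in
# any order and still play back chronologically. The Action(timestamp=...)
# constructor shown here is an assumed signature, purely for illustration.
replay = Replay()
replay.insertActions([Action(timestamp=5.0),
                      Action(timestamp=1.0),
                      Action(timestamp=3.0)])
replay.playback()  # prints the actions in timestamp order: 1.0, 3.0, 5.0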
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]
    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)
            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #        label_off.first+label_off.length]))
            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first
            )
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            for tok in toks:
                add_annotation(tok, label)
                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % (
                        [(t.offsets[OffsetType.LINES].first, t.token) for t in toks],
                        label_off.value))
def get_closest_correct(self, word):
    word = tuple(c for c in word)
    # caching variables for speedup
    self.seen = {word: (0., 0)}
    self.change_cache = {}
    self.done = set()
    self.not_done = SortedCollection(key=lambda x: x[1][0])
    self.not_done.insert(self.seen.items()[0])
    while True:
        new_value, new_words = self.__get_closest_for_seen()
        if len(new_words) == 0:
            return None
        correct_words = new_words & self.corrects
        if len(correct_words) > 0:
            return correct_words
        for w in new_words:
            if w not in self.seen:
                self.seen[w] = new_value
                self.not_done.insert((w, new_value))
class Solution:

    def __init__(self, num_elves):
        self.elves = []
        for i in xrange(1, num_elves + 1):
            elf = Elf(i)
            heapq.heappush(self.elves, (elf.next_available_time, elf))
        self.pending_toys = SortedCollection(key=itemgetter(1))

    def solve(self, toys):
        hrs = Hours()
        next_toy = None
        current_time = 540  # Santa's Workshop opens Jan 1, 2014 9:00 (= 540 minutes)
        while True:
            next_elf_time, elf = heapq.heappop(self.elves)
            current_time = max(current_time, next_elf_time)
            if (next_toy != None and next_toy.arrival_minute <= current_time):
                self.pending_toys.insert((next_toy, next_toy.duration))
                next_toy = None
            if (next_toy == None):
                for toy in toys:
                    if (toy.arrival_minute <= current_time):
                        self.pending_toys.insert((toy, toy.duration))
                    else:
                        next_toy = toy
                        break
            if (len(self.pending_toys) == 0 and next_toy == None):
                raise StopIteration()
            if (len(self.pending_toys) == 0):
                current_time = next_toy.arrival_minute
                continue
            remaining_time = hrs.get_remaining_sanctioned_time(current_time)
            if (remaining_time == hrs.sanctioned_minutes_per_day and elf.rating >= 4):
                toy, duration = self.pending_toys.pop_le(sys.maxint)
            else:
                try:
                    toy, duration = self.pending_toys.pop_le(remaining_time)
                except ValueError:
                    toy, duration = self.pending_toys.pop_le(sys.maxint)
            work_duration = elf.asign_toy(current_time, toy, hrs)
            heapq.heappush(self.elves, (elf.next_available_time, elf))
            yield toy.id, elf.id, current_time, work_duration, elf.rating
def new_queue(item):
    key = lambda x: x.date
    return SortedCollection([item], key)
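# Illustrative sketch, not from the source: new_queue() seeds a SortedCollection
# keyed on each item's .date attribute, so later insert() calls keep the queue
# in date order regardless of arrival order. The Item namedtuple is a
# hypothetical stand-in for whatever object the real code enqueues.
import collections
Item = collections.namedtuple('Item', 'date name')

queue = new_queue(Item(date=20140105, name='b'))
queue.insert(Item(date=20140101, name='a'))
queue.insert(Item(date=20140312, name='c'))
print [i.name for i in queue]  # -> ['a', 'b', 'c'], ordered by date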
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * seq_start = the sequence number at which the data starts, after finish()
    * arrival_data = [(seq_num, pkt)] or SortedCollection
    * final_arrival_data = SortedCollection, after calculate_final_arrivals()
    '''

    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
            flow = tcp.Flow
        '''
        self.arrival_data = []
        self.final_arrival_data = None
        # self.closed_cleanly = False # until proven true
        self.chunks = []
        self.flow = flow
        # the seq number of the first byte of data,
        # valid after finish() if self.data is valid
        self.seq_start = None

    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempts to merge the next chunk (if
        there is one). This way, it is ensured that everything is as fully
        merged as it can be with the current data.

        Args:
            pkt = tcp.Packet
        '''
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i in range(len(self.chunks)):
            chunk = self.chunks[i]
            overlapped, result = chunk.merge(pkt, self.create_merge_callback(pkt))
            if overlapped:  # if the data overlapped
                # if data was added on the back and there is a chunk after this
                if result[1] and i < (len(self.chunks) - 1):
                    # try to merge with the next chunk as well
                    # in case that packet bridged the gap
                    overlapped2, result2 = chunk.merge(self.chunks[i + 1])
                    if overlapped2:  # if that merge worked
                        # data should only be added to back
                        assert ((not result2[0]) and (result2[1]))
                        del self.chunks[i + 1]  # remove the now-redundant chunk
                merged = True
                break  # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)

    def finish(self):
        '''
        Notifies the direction that there are no more packets coming.

        This means that self.data can be decided upon, and arrival_data can
        be converted to a SortedCollection for querying
        '''
        # set data to the data from the first chunk, if there is one
        if self.chunks:
            self.data = self.chunks[0].data
            self.seq_start = self.chunks[0].seq_start
        else:
            self.data = ''
        self.arrival_data = SortedCollection(self.arrival_data,
                                             key=lambda v: v[0])

    def calculate_final_arrivals(self):
        '''
        make self.final_arrival_data a SortedCollection. Final arrival
        for a sequence number is when that sequence number of data and
        all the data before it have arrived, that is, when the data is
        usable by the application. Must be called after self.finish().
        '''
        self.final_arrival_data = []
        peak_time = 0.0
        # final arrival vertex always coincides with an arrival vertex
        for vertex in self.arrival_data:
            if vertex[1].ts > peak_time:
                peak_time = vertex[1].ts
                self.final_arrival_data.append((vertex[0], vertex[1].ts))
        self.final_arrival_data = SortedCollection(
            self.final_arrival_data,
            key=lambda v: v[0]
        )

    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        self.chunks.append(chunk)
        self.sort_chunks()  # it would be better to insert the packet sorted

    def sort_chunks(self):
        self.chunks.sort(key=lambda chunk: chunk.seq_start)

    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.append((seq_num, pkt))
        return callback

    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream.
        byte is assumed to be zero-based.
        '''
        if self.seq_start:
            return byte + self.seq_start
        else:
            return byte + self.flow.first_packet.seq

    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first
        arrived. self.arrival_data must be a SortedCollection at this point;
        self.finish() must have been called.
        '''
        if self.arrival_data:
            return self.arrival_data.find_le(seq_num)[1]

    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived. Will
        calculate final_arrival_data if it has not been already. Only
        callable after self.finish()
        '''
        if not self.final_arrival_data:
            self.calculate_final_arrivals()
        return self.final_arrival_data.find_le(seq_num)[1]
            return item
    return -1

def slow_find_gt(seq, k):
    'First item with a key-value greater-than k.'
    for item in seq:
        if item > k:
            return item
    return -1

from random import choice
pool = [1.5, 2, 2.0, 3, 3.0, 3.5, 4, 4.0, 4.5]
for i in range(500):
    for n in range(6):
        s = [choice(pool) for i in range(n)]
        sc = SortedCollection(s)
        s.sort()
        for probe in pool:
            assert repr(ve2no(sc.index, probe)) == repr(slow_index(s, probe))
            assert repr(ve2no(sc.find, probe)) == repr(slow_find(s, probe))
            assert repr(ve2no(sc.find_le, probe)) == repr(slow_find_le(s, probe))
            assert repr(ve2no(sc.find_lt, probe)) == repr(slow_find_lt(s, probe))
            assert repr(ve2no(sc.find_ge, probe)) == repr(slow_find_ge(s, probe))
            assert repr(ve2no(sc.find_gt, probe)) == repr(slow_find_gt(s, probe))
        for i, item in enumerate(s):
            assert repr(item) == repr(sc[i])  # test __getitem__
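# Note: the test loop above relies on a ve2no() helper that is not defined in
# this fragment. A plausible definition (an assumption, not taken from the
# source) maps a ValueError raised by the SortedCollection lookup to the same
# -1 sentinel the slow_* reference functions return, so both sides can be
# compared with repr():
def ve2no(f, *args):
    'Convert a ValueError result to -1'
    try:
        return f(*args)
    except ValueError:
        return -1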
def __init__(self, dna):
    self.dna = dna
    self.byYield = []
    self.D = {}  # (yield, i) => set(Tuple)
    self.L = SortedCollection(
        key=lambda tup: tup.getStart())  # sorted by i
class InsertablesCollection():

    def __init__(self, dna):
        self.dna = dna
        self.byYield = []
        self.D = {}  # (yield, i) => set(Tuple)
        self.L = SortedCollection(
            key=lambda tup: tup.getStart())  # sorted by i
        # Combinations of tuples, stored in a collection sorted by start, of
        # collections sorted by end, of non-overlapping combinations of Tuples.
        # Can be accessed with like: groups = C[start][end], which will return
        # an collection of all combinations of non-overlapping insertables
        # between (start, end).
        #self.C = CombinationCollection()

    def __repr__(self):
        return self.D.values().__repr__()

    def beginStage(self):
        self.tuplesForStage = []

    def addToStage(self, tup):
        self.tuplesForStage.append(tup)

    def getTuplesForStage(self):
        return self.tuplesForStage

    def filterStage(self, aFilter):
        self.tuplesForStage = aFilter.filter(self.tuplesForStage)

    def completeStage(self):
        self.addAll(self.tuplesForStage)
        self.tuplesForStage = None

    def addAll(self, tuples):
        for tup in tuples:
            self.add(tup)
            #if len(self.D.values()) > 12:
            #    None
            print "\t\tAdded insertable:" + tup.derivation.toString(self.dna)

    def add(self, tup):
        (i, j, k, l) = tup.coordinates
        # add to byYield
        try:
            setByYield = self.byYield[tup.getYield()]
        except IndexError:
            setByYield = set()
            # fill in any missing entries up to current tuple yield (which may never be used)
            self.byYield.extend(
                [set() for x in range(len(self.byYield), tup.getYield())])
            self.byYield.append(setByYield)
        setByYield.add(tup)
        # add to D
        tupleSet = set()
        try:
            tupleSet = self.D[tup.getYield(), i]
        except KeyError:
            self.D[tup.getYield(), i] = tupleSet
        tupleSet.add(tup)
        # add to sorted list
        self.L.insert_right(tup)

    # return all tuples (i,j,k,l) such that i>=start and l<=end, *** sorted by i ***
    def getTuplesInSegment(self, start, end):
        tuples = []
        if (len(self.L) == 0) or (start > self.L[-1].getStart()):
            return tuples
        left = self.L.find_ge(start)  # tuple with lowest i >= start
        for tup in self.L[self.L.index(left):]:
            (i, j, k, l) = tup.coordinates
            if (tup.getStart() > end):
                return tuples
            if (tup.getEnd() > end):
                continue
            tuples.append(tup)
        return tuples

    # Return a list of all tuples
    def getAllTuples(self):
        tuples = []
        for tupleSet in self.D.itervalues():
            tuples.extend(tupleSet)
        return tuples

    # Return a list of tuples for the given yield
    def getTuplesOfYield(self, tYield):
        try:
            return list(self.byYield[tYield])
        except IndexError:
            return []  # no parse entries for stage

    # Return a list of tuples for the given yield
    def getTuplesOfYieldAndI(self, tYield, i):
        if (tYield, i) in self.D:
            return self.D[tYield, i]
        else:
            return set()

    # Return a list of all combinations of non-overlapping insertables in region (start, end)
    def combinations(self, start, end):
        time0 = time.clock()
        segment = self.getTuplesInSegment(start, end)
        nonoverlapping = []
        limit = min(stag13.Probabilities.MAX_INSERTIONS_PER_MIDDLE + 1,
                    len(segment) + 1)
        for k in range(1, limit):
            for combination in itertools.combinations(segment, k):
                try:
                    tup0 = None
                    for tup in combination:
                        if tup0 is None:
                            tup0 = tup
                        else:
                            if tup.overlaps(tup0):
                                raise ValueError()
                            tup0 = tup
                    nonoverlapping.append(combination)
                except ValueError:
                    None
        #print "\t\t---> insertion combinations (%s..%s): %-3s %0.1e s" % (
        #    start, end, len(nonoverlapping), time.clock()-time0)
        return nonoverlapping
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a
    new chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
            if start > end:
                ## skip this sentence... because it was eaten by
                ## an earlier sentence with a label
                continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start, end, exc_info=True)
                    sys.exit('failed to cope with %r in %r'
                             % (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
class CloseWordsGenerator(object):

    def __init__(self, correct_words, transmatrix=None, max_distance=3):
        self.__store_matrix(transmatrix)
        self.corrects = set(tuple(c for c in w) for w in correct_words)
        self.max_dist = max_distance

    def __store_matrix(self, m):
        """stores every row of the matrix in weighted order
        """
        chars = set(m.iterkeys())
        for inner_d in m.itervalues():
            chars |= set(inner_d.itervalues())
        d = {}
        for c1, row in m.iteritems():
            values = [(c2, w) for c2, w in row.iteritems() if w > 1e-10]
            d[c1] = sorted(values, key=lambda x: x[1])
        self.transitions = d

    def get_char_changes(self, word, src_char):
        row = self.transitions[src_char]
        values = []
        for tgt, weight in row:
            new_words = set(gen_changed(word, src_char, tgt))
            values.append((weight, new_words))
        return sorted(values, key=lambda x: x[0])

    def get_closest(self, word):
        """Computes closest word(s) based on stored transition matrix
        """
        t = self.transitions
        chars = set(word) | set([''])
        if word not in self.change_cache:
            self.change_cache[word] = []
            for c in chars:
                if c not in t or len(t[c]) == 0:
                    continue
                self.change_cache[word] += self.get_char_changes(word, c)
        if len(self.change_cache[word]) == 0:
            del self.change_cache[word]
            return None
        return self.change_cache[word].pop(0)

    def choose_next(self):
        if len(self.not_done) == 0:
            return
        return self.not_done[0]

    def __get_closest_for_seen(self):
        best = [None, set()]
        while len(best[1]) == 0:
            n = self.choose_next()
            if n is None:
                break
            word, (old_weight, old_dist) = n
            # skip if old_word is already as far as it can be
            if old_dist == self.max_dist:
                self.done.add(word)
                self.not_done.remove(self.not_done[0])
                continue
            cl = self.get_closest(word)
            if cl is None:
                self.done.add(word)
                self.not_done.remove(self.not_done[0])
                continue
            change_weight, new_words = cl
            new_weight = old_weight + change_weight
            if best[0] is None:
                best[0] = (new_weight, old_dist + 1)
                best[1] = new_words
            elif new_weight < best[0][0]:
                best[0] = (new_weight, old_dist + 1)
                best[1] = new_words
            elif new_weight == best[0][0]:
                best[1] |= new_words
        return best

    def get_closest_correct(self, word):
        word = tuple(c for c in word)
        # caching variables for speedup
        self.seen = {word: (0., 0)}
        self.change_cache = {}
        self.done = set()
        self.not_done = SortedCollection(key=lambda x: x[1][0])
        self.not_done.insert(self.seen.items()[0])
        while True:
            new_value, new_words = self.__get_closest_for_seen()
            if len(new_words) == 0:
                return None
            correct_words = new_words & self.corrects
            if len(correct_words) > 0:
                return correct_words
            for w in new_words:
                if w not in self.seen:
                    self.seen[w] = new_value
                    self.not_done.insert((w, new_value))
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * finished = bool. Indicates whether more packets should be expected.
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * arrival_data = SortedCollection([(seq_num, pkt)])
    * final_arrival_data = SortedCollection([(seq_num, ts)])
    * final_data_chunk = Chunk or None, the chunk that contains the final
      data, only after seq_start is valid
    * final_arrival_pointer = the end sequence number of data that has
      completely arrived
    '''

    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
            flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None

    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempts to merge the next chunk (if
        there is one). This way, it is ensured that everything is as fully
        merged as it can be with the current data.

        Args:
            pkt = tcp.Packet
        '''
        if self.finished:
            raise RuntimeError('tried to add packets to a finished tcp.Direction')
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i, chunk in enumerate(self.chunks):
            overlapped, (front, back) = chunk.merge(pkt,
                                                    self.create_merge_callback(pkt))
            if overlapped:
                # check if this packet bridged the gap between two chunks
                if back and i < (len(self.chunks) - 1):
                    overlapped2, result2 = chunk.merge(self.chunks[i + 1])
                    if overlapped2:
                        assert ((not result2[0]) and (result2[1]))
                        del self.chunks[i + 1]
                # if this is the main data chunk, calc final arrival
                if self.seq_start and chunk.seq_start == self.seq_start:
                    if back:
                        self.final_arrival_data.insert(
                            (self.final_arrival_pointer, pkt.ts))
                    if not self.final_data_chunk:
                        self.final_data_chunk = chunk
                    self.final_arrival_pointer = self.final_data_chunk.seq_end
                merged = True
                break  # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)

    @property
    def data(self):
        '''
        returns the TCP data, as far as it has been determined.
        '''
        if self.final_data_chunk:
            return self.final_data_chunk.data
        else:
            if self.finished:
                return ''  # no data was ever added
            else:
                return None  # just don't know at all

    @property
    def seq_start(self):
        '''
        starting sequence number, as far as we can tell now.
        '''
        if self.flow.handshake:
            if self is self.flow.fwd:
                return self.flow.handshake[2].seq
            elif self is self.flow.rev:
                return self.flow.handshake[1].seq + 1
            else:
                raise RuntimeError(
                    "holy crap, tcp.Direction has a flow it doesn't belong to")
        elif self.finished:
            if self.chunks:
                return self.chunks[0].seq_start
            else:
                log.warning('getting seq_start from finished tcp.Direction '
                            'with no handshake and no data')
                return None
        else:
            return None

    def finish(self):
        '''
        Notifies the direction that there are no more packets coming.

        This means that self.data can be decided upon.
        '''
        self.finished = True
        if self.chunks and not self.final_data_chunk:
            self.final_data_chunk = self.chunks[0]

    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        if self.seq_start and chunk.seq_start == self.seq_start:
            self.final_data_chunk = chunk
            self.final_arrival_pointer = chunk.seq_end
            self.final_arrival_data.insert((pkt.seq, pkt.ts))
        # it would be better to insert the packet sorted here
        self.chunks.append(chunk)
        self.chunks.sort(key=lambda chunk: chunk.seq_start)

    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.insert((seq_num, pkt))
        return callback

    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream.
        byte is assumed to be zero-based.

        Returns None if seq_start is None
        '''
        # TODO better handle case where seq_start is None
        seq_start = self.seq_start
        if seq_start is not None:
            return byte + seq_start
        else:
            return None

    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first
        arrived.
        '''
        return self.arrival_data.find_le(seq_num)[1]

    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived, that is,
        when all the data before it had also arrived.
        '''
        return self.final_arrival_data.find_le(seq_num)[1]
class Direction:
    '''
    Represents data moving in one direction in a TCP flow.

    Members:
    * finished = bool. Indicates whether more packets should be expected.
    * chunks = [tcp.Chunk], sorted by seq_start
    * flow = tcp.Flow, the flow to which the direction belongs
    * arrival_data = SortedCollection([(seq_num, pkt)])
    * final_arrival_data = SortedCollection([(seq_num, ts)])
    * final_data_chunk = Chunk or None, the chunk that contains the final
      data, only after seq_start is valid
    * final_arrival_pointer = the end sequence number of data that has
      completely arrived
    '''

    def __init__(self, flow):
        '''
        Sets things up for adding packets.

        Args:
            flow = tcp.Flow
        '''
        self.finished = False
        self.flow = flow
        self.arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_data = SortedCollection(key=itemgetter(0))
        self.final_arrival_pointer = None
        self.chunks = []
        self.final_data_chunk = None

    def add(self, pkt):
        '''
        Merge the packet into the first chunk it overlaps with. If data was
        added to the end of a chunk, attempts to merge the next chunk (if
        there is one). This way, it is ensured that everything is as fully
        merged as it can be with the current data.

        Args:
            pkt = tcp.Packet
        '''
        if self.finished:
            raise RuntimeError('tried to add packets to a finished tcp.Direction')
        # discard packets with no payload. we don't care about them here
        if pkt.data == '':
            return
        # attempt to merge packet with existing chunks
        merged = False
        for i, chunk in enumerate(self.chunks):
            overlapped, (front, back) = chunk.merge(pkt,
                                                    self.create_merge_callback(pkt))
            if overlapped:
                # check if this packet bridged the gap between two chunks
                if back and i < (len(self.chunks) - 1):
                    overlapped2, result2 = chunk.merge(self.chunks[i + 1])
                    if overlapped2:
                        assert ((not result2[0]) and (result2[1]))
                        del self.chunks[i + 1]
                # if this is the main data chunk, calc final arrival
                if self.seq_start and chunk.seq_start == self.seq_start:
                    if front:  # packet was first in stream but just now arriving
                        self.final_arrival_data.insert((self.seq_start, pkt.ts))
                    if back:  # usual case
                        self.final_arrival_data.insert(
                            (self.final_arrival_pointer, pkt.ts))
                    if not self.final_data_chunk:
                        self.final_data_chunk = chunk
                    self.final_arrival_pointer = self.final_data_chunk.seq_end
                merged = True
                break  # skip further chunks
        if not merged:
            # nothing overlapped with the packet
            # we need a new chunk
            self.new_chunk(pkt)

    @property
    def data(self):
        '''
        returns the TCP data, as far as it has been determined.
        '''
        if self.final_data_chunk:
            return self.final_data_chunk.data
        else:
            if self.finished:
                return ''  # no data was ever added
            else:
                return None  # just don't know at all

    @property
    def seq_start(self):
        '''
        starting sequence number, as far as we can tell now.
        '''
        if self.flow.handshake:
            if self is self.flow.fwd:
                return self.flow.handshake[2].seq
            elif self is self.flow.rev:
                return self.flow.handshake[1].seq + 1
            else:
                raise RuntimeError(
                    "holy crap, tcp.Direction has a flow it doesn't belong to")
        elif self.finished:
            if self.chunks:
                return self.chunks[0].seq_start
            else:
                log.warning('getting seq_start from finished tcp.Direction '
                            'with no handshake and no data')
                return None
        else:
            return None

    def finish(self):
        '''
        Notifies the direction that there are no more packets coming.

        This means that self.data can be decided upon. Also calculates
        final_arrival for any packets that arrived while seq_start was None
        '''
        self.finished = True
        # calculate final_arrival
        if not self.final_arrival_data:
            peak_time = 0.0
            for vertex in self.arrival_data:
                if vertex[1].ts > peak_time:
                    peak_time = vertex[1].ts
                    self.final_arrival_data.insert((vertex[0], vertex[1].ts))
        if self.chunks and not self.final_data_chunk:
            self.final_data_chunk = self.chunks[0]

    def new_chunk(self, pkt):
        '''
        creates a new tcp.Chunk for the pkt to live in. Only called if an
        attempt has been made to merge the packet with all existing chunks.
        '''
        chunk = tcp.Chunk()
        chunk.merge(pkt, self.create_merge_callback(pkt))
        if self.seq_start and chunk.seq_start == self.seq_start:
            self.final_data_chunk = chunk
            self.final_arrival_pointer = chunk.seq_end
            self.final_arrival_data.insert((pkt.seq, pkt.ts))
        # it would be better to insert the chunk sorted here
        self.chunks.append(chunk)
        self.chunks.sort(key=lambda chunk: chunk.seq_start)

    def create_merge_callback(self, pkt):
        '''
        Returns a function that will serve as a callback for Chunk. It will
        add the passed sequence number and the packet to self.arrival_data.
        '''
        def callback(seq_num):
            self.arrival_data.insert((seq_num, pkt))
        return callback

    def byte_to_seq(self, byte):
        '''
        Converts the passed byte index to a sequence number in the stream.
        byte is assumed to be zero-based.

        Returns None if seq_start is None
        '''
        # TODO better handle case where seq_start is None
        seq_start = self.seq_start
        if seq_start is not None:
            return byte + seq_start
        else:
            return None

    def seq_arrival(self, seq_num):
        '''
        returns the packet in which the specified sequence number first
        arrived.
        '''
        try:
            return self.arrival_data.find_le(seq_num)[1]
        except ValueError:
            return None

    def seq_final_arrival(self, seq_num):
        '''
        Returns the time at which the seq number had fully arrived, that is,
        when all the data before it had also arrived.
        '''
        try:
            return self.final_arrival_data.find_le(seq_num)[1]
        except:
            return None
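# Illustrative sketch, not from the source: a typical query against a finished
# Direction maps a byte offset in the reassembled stream to a sequence number,
# then asks the two SortedCollections when that byte first arrived and when it
# had fully arrived. `direction` and `byte_index` are assumed inputs.
def arrival_times(direction, byte_index):
    seq = direction.byte_to_seq(byte_index)
    if seq is None:
        return None, None
    pkt = direction.seq_arrival(seq)       # packet that first carried this sequence number
    ts = direction.seq_final_arrival(seq)  # time when all data up to seq had arrived
    return (pkt.ts if pkt else None), ts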
class KeyframeSceneTemplate(SceneTemplate):

    class KeyframePlayableScene(PlayableScene):

        def __init__(self, scene_template, **kwargs):
            self.colors = kwargs.pop('colors')
            PlayableScene.__init__(self, **kwargs)
            self.scene_template = scene_template
            self.seconds_in_scene = 0
            self.beats_in_scene = 0
            self._prev_frame = None  # The previous frame before 'now'
            self._next_frame = None  # The next frame after 'now'
            self._prev_at = None  # The time in seconds at which the previous frame occurred
            self._next_at = None  # The time in seconds at which the next frame will occur
            self._prev_rgb = None  # [(0,0,0)]*PANEL_NUM
            self._next_rgb = None  # [(0,0,0)]*PANEL_NUM

        def step(self, seconds):
            if not PlayableScene.step(self, seconds):
                # If the nothing happened in this step, then don't bother with calc
                # this probably just means that seconds = 0
                return False
            self.seconds_in_scene += seconds
            self.beats_in_scene += self.beats_in_step
            if self.beats_in_scene >= self.scene_template.scene_length:
                # TODO: Determine if this should be >= or >
                self.beats_in_scene = self.beats_in_scene % self.scene_template.scene_length
                self.seconds_in_scene = self.seconds_in_scene % (self.scene_template.scene_length * self.spb())
            return True  # Remember to return True or nothing happens!

        def rgb(self):
            if not self._prev_frame or not self._next_frame or self.beats() != 0:
                old_prev_frame = self._prev_frame
                old_next_frame = self._next_frame
                looped_forwards_to_beginning = False
                looped_backwards_to_end = False
                try:
                    self._prev_frame = self.scene_template._keyframes.find_le(self.beats_in_scene)
                except ValueError:
                    # If we can't find a key less than this beat then this is the beginning
                    # so use the last frame
                    self._prev_frame = self.scene_template._keyframes[-1]
                    looped_backwards_to_end = True
                try:
                    self._next_frame = self.scene_template._keyframes.find_gt(self.beats_in_scene)
                except ValueError:
                    # If we can't find a key great than this beat then this is the end so
                    # use the first frame
                    self._next_frame = self.scene_template._keyframes[0]
                    looped_forwards_to_beginning = True
                # If the keyframes have changed, recalculate rgb values
                if old_prev_frame != self._prev_frame or old_next_frame != self._next_frame:
                    if looped_backwards_to_end:
                        # If we have to loop around to the end to get the 'previous' keyframe
                        # then do calculations with that in mind
                        from_end = self.scene_template.scene_length * self.spb() - self._prev_frame[0] * self.spb()
                        self._prev_at = -(from_end)
                    else:
                        # Otherwise the previous time of the keyframe is whatever beat that keyframe is on times secs/beat
                        self._prev_at = self._prev_frame[0] * self.spb()
                    if looped_forwards_to_beginning:
                        from_begin = self._next_frame[0] * self.spb()
                        self._next_at = self.scene_template.scene_length * self.spb() + from_begin
                    else:
                        self._next_at = self._next_frame[0] * self.spb()
                    assert (self._prev_at < self.seconds_in_scene)
                    assert (self._next_at > self.seconds_in_scene)
                    self._prev_rgb = [self.colors[x] for x in self._prev_frame[1].panel_colors]
                    self._next_rgb = [self.colors[x] for x in self._next_frame[1].panel_colors]
                    #print 'Using new keyframe'
                    #print self._prev_rgb
                    #print self._next_rgb
                else:
                    pass
                    #print 'Using same keyframe'
            # If the next keyframe is a 'switch to' frame
            if isinstance(self._next_frame[1], KeyframeSceneTemplate.SwitchToFrame):
                # then just play the previously calculated rgb
                return self._prev_rgb  # Just play the previous
            elif isinstance(self._next_frame[1], KeyframeSceneTemplate.FadeToFrame):
                # Calculate the rgb for 'now'
                # seconds after the last keyframe OVER total seconds between keyframe
                progress = (self.seconds_in_scene - self._prev_at) / (self._next_at - self._prev_at)
                #print "We are %s%% through this area between keyframes at %s and %s" % ((progress*100), self._prev_at, self._next_at)
                assert (0 < progress < 1)
                now_rgb = []
                # Look at each panel
                for i in xrange(PANEL_NUM):
                    now_rgb.append(
                        map(lambda a, b: int((b - a) * progress + a),
                            self._prev_rgb[i], self._next_rgb[i]),
                        # (self._next_rgb[i] - self._prev_rgb[i])*progress) + self._prev_rgb[i]
                    )
                return now_rgb
            else:
                raise ValueError('Unknown frame type encountered %s at %s'
                                 % (self._next_frame[1], self._next_frame[0]))

    def __init__(self, scene_length):
        self._keyframes = SortedCollection(key=itemgetter(0))
        self.scene_length = scene_length

    def set_wall_frame(self, at_beat, panel_color, fade_to=True):
        return self.set_frame(at_beat, [panel_color] * PANEL_NUM, fade_to)

    def set_frame(self, at_beat, panel_colors, fade_to=True):
        """`panel_colors` is either a list which is the same length as the
        number of panels in the lightwall and whose content is an integer
        'color id' OR it is a dictionary with keys that are integer values
        no greater than the number of panels in the lightwall and whose
        value is a integer 'color id'.

        e.g., [3,1,None,4,...] is equivalent to {1:3,2:1,4:4}

        If there is already a keyframe at the given beat, it is overwritten.
        """
        if type(at_beat) != int:
            print "Warning: settings non-integer beat"
        if type(panel_colors) == dict:
            l = []
            for i in xrange(PANEL_NUM):
                l.append(panel_colors.get(i, None))
            panel_colors = l
        if at_beat < 0:
            raise ValueError("Tried to add a keyframe before beginning of scene (<0)")
        elif at_beat > self.scene_length:
            raise ValueError("Tried to add a keyframe after end of scene (>scene_length)")
        try:
            # try to remove a keyframe at that beat because we want to
            # overwrite it if there's one there
            self._keyframes.remove(self._keyframes.find(at_beat))
        except ValueError:
            pass
        if fade_to:
            self._keyframes.insert((at_beat, KeyframeSceneTemplate.FadeToFrame(panel_colors)))
        else:
            self._keyframes.insert((at_beat, KeyframeSceneTemplate.SwitchToFrame(panel_colors)))

    def remove_frame(self, at_beat):
        self._keyframes.remove(self._keyframes.find(at_beat))

    def bind(self, scene_data):
        return KeyframeSceneTemplate.KeyframePlayableScene(self, **scene_data)

    class Frame(object):
        def __init__(self, panel_colors):
            self.panel_colors = panel_colors

    class FadeToFrame(Frame):
        pass

    class SwitchToFrame(Frame):
        pass
def __init__(self, scene_length):
    self._keyframes = SortedCollection(key=itemgetter(0))
    self.scene_length = scene_length
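# Illustrative sketch, not from the source: with keyframes stored as
# (beat, frame) tuples keyed on itemgetter(0), the frames that bracket the
# current beat can be found in O(log n); this mirrors the find_le/find_gt
# lookups in KeyframePlayableScene.rgb() above. `template` and `beat` are
# assumed inputs for illustration.
def bracketing_frames(template, beat):
    try:
        prev_frame = template._keyframes.find_le(beat)  # last keyframe at or before `beat`
    except ValueError:
        prev_frame = template._keyframes[-1]            # wrap backwards to the final keyframe
    try:
        next_frame = template._keyframes.find_gt(beat)  # first keyframe strictly after `beat`
    except ValueError:
        next_frame = template._keyframes[0]             # wrap forwards to the first keyframe
    return prev_frame, next_frame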
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a
    new chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
            if start > end:
                ## skip this sentence... because it was eaten by
                ## an earlier sentence with a label
                continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences

    def process_item(self, stream_item, context=None):
        if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible:
            return stream_item
        self.label_index = None
        self.label_to_mention_id = dict()
        stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item)
        return stream_item

    def __call__(self, stream_item, context=None):
        ## support the legacy callable API
        return self.process_item(stream_item, context)