def __iter__(self):
    """
    Yield the StreamItem instances stored in this chunk, one at a time.

    Each iteration rewinds the underlying file handle to offset zero
    before reading, so the same chunk can be iterated repeatedly.
    Iteration ends when EOFError is raised while reading.

    Because this is a generator, the emptiness assertion below fires
    when iteration starts, not when __iter__ is called.
    """
    assert self._chunk_fh, "cannot iterate over stream_items in an empty Chunk"

    ## rewind, so that every pass starts from the first item
    self._chunk_fh.seek(0)

    ## Thrift Binary Protocol layered on a buffered transport around
    ## the chunk's file handle
    protocol = TBinaryProtocol.TBinaryProtocol(
        TTransport.TBufferedTransport(self._chunk_fh))

    while True:
        ## a fresh instance per record, so each yielded item is its
        ## own object
        stream_item = StreamItem()
        try:
            ## populate it from the thrift protocol instance
            stream_item.read(protocol)
            yield stream_item
        except EOFError:
            ## input is exhausted: finish the generator
            return
def __iter__(self):
    '''
    Generator over the StreamItems held in this chunk.

    The file handle is rewound to its start on every iteration, which
    makes it possible to loop over the same chunk more than once.
    The generator finishes when an EOFError is raised during reading.

    The emptiness assertion runs when the first item is requested,
    because the body of a generator does not execute until then.
    '''
    chunk_fh = self._chunk_fh
    assert chunk_fh, 'cannot iterate over stream_items in an empty Chunk'

    ## always start reading from the beginning of the chunk
    chunk_fh.seek(0)

    ## buffered transport around the file handle ...
    transport = TTransport.TBufferedTransport(chunk_fh)
    ## ... decoded with the Thrift Binary Protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    while True:
        ## one new StreamItem per record read
        item = StreamItem()
        try:
            item.read(protocol)
            yield item
        except EOFError:
            ## nothing left to read
            return
def make_stream_item(zulu_timestamp, abs_url):
    """
    Assemble a minimal StreamItem with internally consistent
    .stream_time.zulu_timestamp, .stream_time.epoch_ticks, .abs_url,
    .doc_id, and .stream_id

    :param zulu_timestamp: passed unchanged to make_stream_time()
    :param abs_url: URL of the document, as a byte string or a text
      string; it is stored unchanged in .abs_url
    :returns: a new StreamItem whose .doc_id is the hex MD5 digest of
      the URL bytes and whose .stream_id is "<epoch_ticks>-<doc_id>"
    """
    st = make_stream_time(zulu_timestamp)

    ## hashlib only digests bytes.  A text URL is encoded as UTF-8
    ## first, so that text input (any str on Python 3, non-ASCII
    ## unicode on Python 2) no longer raises.  Byte strings are hashed
    ## exactly as before.
    url_bytes = abs_url
    if isinstance(url_bytes, type(u"")):
        url_bytes = url_bytes.encode("utf-8")

    si = StreamItem()
    si.stream_time = st
    si.abs_url = abs_url
    si.doc_id = hashlib.md5(url_bytes).hexdigest()
    si.stream_id = "%d-%s" % (st.epoch_ticks, si.doc_id)
    return si
def make_stream_item(zulu_timestamp, abs_url):
    '''
    Assemble a minimal StreamItem with internally consistent
    .stream_time.zulu_timestamp, .stream_time.epoch_ticks, .abs_url,
    .doc_id, and .stream_id

    :param zulu_timestamp: passed unchanged to make_stream_time()
    :param abs_url: URL of the document, as a byte string or a text
      string; it is stored unchanged in .abs_url
    :returns: a new StreamItem whose .doc_id is the hex MD5 digest of
      the URL bytes and whose .stream_id is '<epoch_ticks>-<doc_id>'
    '''
    st = make_stream_time(zulu_timestamp)

    ## hashlib only digests bytes.  A text URL is encoded as UTF-8
    ## first, so that text input (any str on Python 3, non-ASCII
    ## unicode on Python 2) no longer raises.  Byte strings are hashed
    ## exactly as before.
    url_bytes = abs_url
    if isinstance(url_bytes, type(u'')):
        url_bytes = url_bytes.encode('utf-8')

    si = StreamItem()
    si.stream_time = st
    si.abs_url = abs_url
    si.doc_id = hashlib.md5(url_bytes).hexdigest()
    si.stream_id = '%d-%s' % (st.epoch_ticks, si.doc_id)
    return si