def test_verify_low_seed_tiles(self):
    """Zooming a marshalled coord int up one level must equal
    marshalling the coord's parent directly."""
    from tilequeue.tile import coord_int_zoom_up
    from tilequeue.tile import coord_marshall_int
    from tilequeue.tile import seed_tiles
    seed_coords = seed_tiles(1, 5)
    for coord in seed_coords:
        coord_int = coord_marshall_int(coord)
        parent_coord = coord.zoomTo(coord.zoom - 1).container()
        exp_int = coord_marshall_int(parent_coord)
        act_int = coord_int_zoom_up(coord_int)
        # assertEquals is a deprecated alias; assertEqual is preferred
        self.assertEqual(exp_int, act_int)
def __call__(self, coords):
    """Intersect coords with the TOI, batch them up and enqueue.

    Coordinates at or below ``group_by_zoom`` are bucketed under their
    parent tile at that zoom, one payload per bucket; everything above
    that zoom travels together in a single extra "low zoom" payload.
    """
    intersected, intersect_metrics, timing = self.toi_intersector(coords)

    buckets = defaultdict(list)
    low_zoom_ints = set()
    for coord in intersected:
        if coord.zoom >= self.group_by_zoom:
            parent_int = coord_marshall_int(
                coord.zoomTo(self.group_by_zoom).container())
            buckets[parent_int].append(coord)
        else:
            low_zoom_ints.add(coord_marshall_int(coord))

    payloads = []
    n_coords = 0
    for bucket_coords in buckets.itervalues():
        payloads.append(self.msg_marshaller.marshall(bucket_coords))
        n_coords += len(bucket_coords)

    # add all low zooms into a single payload
    # NOTE(review): low zoom coords are not counted in n_coords — this
    # matches the original behaviour; confirm whether intentional
    low_zoom_coords = [coord_unmarshall_int(ci) for ci in low_zoom_ints]
    if low_zoom_coords:
        payloads.append(self.msg_marshaller.marshall(low_zoom_coords))

    n_payloads = len(payloads)
    rawr_queue_batch_size = 10
    n_msgs_sent = 0
    for chunk in grouper(payloads, rawr_queue_batch_size):
        self.rawr_queue.send(chunk, self.logger)
        n_msgs_sent += 1

    if self.logger:
        self.logger.info(
            'Rawr tiles enqueued: '
            'coords(%d) payloads(%d) enqueue_calls(%d) '
            'toi(%d) hits(%d) misses(%d)' % (
                n_coords, n_payloads, n_msgs_sent,
                intersect_metrics['n_toi'], intersect_metrics['hits'],
                intersect_metrics['misses']))

    self.stats_handler(n_coords, n_payloads, n_msgs_sent,
                       intersect_metrics, timing)
def write_coords_redis_protocol(self, out, set_key, coords):
    """Stream SADD commands for coords in the redis wire protocol.

    Meant to be called with ``out`` bound to stdout and the output
    piped into ``redis-cli --pipe``.
    """
    # http://redis.io/topics/protocol
    # An attempt was made to send over integers directly via the
    # protocol, but redis wants strings on the wire. It seems to end
    # up storing the numeric strings as integers anyway.
    key_len = len(set_key)
    for coord in coords:
        val = str(coord_marshall_int(coord))
        out.write('*3\r\n'
                  '$4\r\nSADD\r\n'
                  '$%(key_len)d\r\n%(key)s\r\n'
                  '$%(val_len)d\r\n%(val)s\r\n' % dict(
                      key_len=key_len,
                      val_len=len(val),
                      key=set_key,
                      val=val,
                  ))
def __call__(self, coords):
    """Expand each coord through all parents down to zoom_stop_inclusive.

    Returns (coords iterable, metrics dict, timing dict). No TOI lookup
    happens here, so every generated coord counts as a "hit".
    """
    timing = dict(fetch=0, intersect=0)
    with time_block(timing, 'intersect'):
        seen_ints = set()
        for coord in coords:
            while coord.zoom >= self.zoom_stop_inclusive:
                coord_int = coord_marshall_int(coord)
                if coord_int in seen_ints:
                    # optimization: if this coord was already seen,
                    # all of its parents must have been seen too
                    break
                seen_ints.add(coord_int)
                coord = coord.zoomBy(-1).container()
    result_coords = imap(coord_unmarshall_int, seen_ints)
    n = len(seen_ints)
    metrics = dict(
        total=n,
        hits=n,
        misses=0,
        n_toi=0,
        cached=False,
    )
    return result_coords, metrics, timing
def intersect(self, coords, tiles_of_interest=None):
    """Yield only those coords present in the tiles of interest.

    When ``tiles_of_interest`` is not supplied, it is fetched fresh.
    """
    if tiles_of_interest is None:
        tiles_of_interest = self.fetch_tiles_of_interest()
    for coord in coords:
        if coord_marshall_int(coord) in tiles_of_interest:
            yield coord
def test_enqueue_should_check_if_pending_work(self):
    """enqueue must consult the in-flight set before sending."""
    from tilequeue.tile import coord_marshall_int
    coord = Coordinate(row=1, column=1, zoom=1)
    self.sqs.enqueue(coord)
    expected = coord_marshall_int(coord)
    self.mockRedis.sismember.assert_called_once_with(
        self.sqs.inflight_key, expected)
def enqueue_batch(self, coords):
    """Enqueue coords onto their per-zoom SQS queues in batches.

    Invalid coords are skipped; coords already in flight are counted
    but not re-sent. Returns (n_queued, n_in_flight).
    """
    buf_per_queue = {}
    n_queued = 0
    n_in_flight = 0
    for coord in coords:
        # TODO log?
        if not coord_is_valid(coord):
            continue
        coord_int = coord_marshall_int(coord)
        if self._inflight(coord_int):
            n_in_flight += 1
        else:
            n_queued += 1
            sqs_queue_name = self.get_queue_name_for_zoom(coord.zoom)
            queue_buf = buf_per_queue.setdefault(sqs_queue_name, [])
            queue_buf.append((coord, coord_int))
            if len(queue_buf) == self.queue_buf_size:
                sqs_queue = self.sqs_queue_for_name.get(sqs_queue_name)
                # bug fix: previously asserted the (always truthy)
                # queue *name* instead of the lookup result, so a
                # missing queue crashed later with a confusing error
                assert sqs_queue, \
                    'Missing queue for: %s' % sqs_queue_name
                self._write_batch(sqs_queue, queue_buf)
                del queue_buf[:]
    # flush any partially filled per-queue buffers
    for queue_name, queue_buf in buf_per_queue.items():
        if queue_buf:
            sqs_queue = self.sqs_queue_for_name.get(queue_name)
            assert sqs_queue, 'Missing queue for: %s' % queue_name
            self._write_batch(sqs_queue, queue_buf)
    return n_queued, n_in_flight
def test_enqueue_should_check_if_pending_work(self):
    """enqueue must consult the in-flight set with the coord int."""
    from tilequeue.tile import coord_marshall_int
    coord = Coordinate(row=1, column=1, zoom=1)
    self.sqs.enqueue(coord)
    self.mockRedis.sismember.assert_called_once_with(
        self.sqs.inflight_key, coord_marshall_int(coord))
def test_verify_examples(self):
    """coord_int_zoom_up matches marshalling the parent coordinate,
    including edge-of-world tiles at several zooms."""
    from ModestMaps.Core import Coordinate
    from tilequeue.tile import coord_int_zoom_up
    from tilequeue.tile import coord_marshall_int
    test_coords = (
        Coordinate(zoom=20, column=1002463, row=312816),
        Coordinate(zoom=20, column=(2 ** 20)-1, row=(2 ** 20)-1),
        Coordinate(zoom=10, column=(2 ** 10)-1, row=(2 ** 10)-1),
        Coordinate(zoom=5, column=20, row=20),
        Coordinate(zoom=1, column=0, row=0),
    )
    for coord in test_coords:
        coord_int = coord_marshall_int(coord)
        parent_coord = coord.zoomTo(coord.zoom - 1).container()
        exp_int = coord_marshall_int(parent_coord)
        act_int = coord_int_zoom_up(coord_int)
        # assertEquals is a deprecated alias; assertEqual is preferred
        self.assertEqual(exp_int, act_int)
def load_set_from_fp(fp):
    """Read serialized coords (one per line) into a set of coord ints."""
    return set(
        coord_marshall_int(deserialize_coord(coord_str))
        for coord_str in fp)
def coord_ints_from_paths(paths):
    """Collect the set of coord ints from every tiles file in paths."""
    coord_set = set()
    for path in paths:
        with open(path) as fp:
            for coord in create_coords_generator_from_tiles_file(fp):
                coord_set.add(coord_marshall_int(coord))
    return coord_set
def test_enqueue_adds_tile_as_in_flight(self):
    """Enqueuing a new coord must add its int form to the in-flight set."""
    from tilequeue.tile import coord_marshall_int
    self.mockRedis.sismember = MagicMock(return_value=False)
    self.mockRedis.sadd = MagicMock()
    coord = Coordinate(row=1, column=1, zoom=1)
    self.sqs.enqueue(coord)
    self.mockRedis.sadd.assert_called_once_with(
        self.sqs.inflight_key, coord_marshall_int(coord))
def test_tilequeue_explode_and_intersect(self):
    """Exploding a z14 coord against a TOI up to z10 with until=11
    yields the 4 coords above zoom 10."""
    from tilequeue.command import explode_and_intersect
    from tilequeue.tile import coord_marshall_int
    from tilequeue.tile import coord_unmarshall_int
    from ModestMaps.Core import Coordinate
    sample_coord = Coordinate(zoom=14, column=250, row=250)
    sample_coord_int = coord_marshall_int(sample_coord)
    tiles_of_interest = [sample_coord_int]
    for i in (10, 11, 12, 13):
        coord = sample_coord.zoomTo(i)
        coord_int = coord_marshall_int(coord)
        tiles_of_interest.append(coord_int)
    exploded = explode_and_intersect([sample_coord_int], tiles_of_interest,
                                     until=11)
    coord_ints = list(exploded)
    for coord_int in coord_ints:
        coord = coord_unmarshall_int(coord_int)
        # failUnless is a deprecated alias; assertTrue is preferred
        self.assertTrue(coord.zoom > 10)
    self.assertEqual(4, len(coord_ints))
def index_coords(self, coords):
    """Add coords to the redis cache set, batching SADD calls."""
    batch_size = 100
    pending = []
    for coord in coords:
        pending.append(coord_marshall_int(coord))
        if len(pending) >= batch_size:
            self.redis_client.sadd(self.cache_set_key, *pending)
            del pending[:]
    # flush whatever is left over from the last partial batch
    if pending:
        self.redis_client.sadd(self.cache_set_key, *pending)
def _write_batch(self, coords):
    """Write up to 10 coords to SQS and mark them all as in flight."""
    assert len(coords) <= 10
    msg_tuples = [(str(i), serialize_coord(coord), 0)
                  for i, coord in enumerate(coords)]
    values = [coord_marshall_int(coord) for coord in coords]
    self.sqs_queue.write_batch(msg_tuples)
    self.redis_client.sadd(self.inflight_key, *values)
def test_job_done_removes_tile_from_in_flight(self):
    """job_done must remove the coord's int form from the in-flight set."""
    from tilequeue.tile import CoordMessage
    from tilequeue.tile import coord_marshall_int
    coord = Coordinate(row=1, column=1, zoom=1)
    message = RawMessage()
    message.set_body(serialize_coord(coord))
    self.sqs.job_done(CoordMessage(coord, message))
    self.mockRedis.srem.assert_called_once_with(
        self.sqs.inflight_key, coord_marshall_int(coord))
def enqueue_batch(self, coords):
    """Push coord ints onto the redis queue in batches.

    Returns (n_enqueued, n_in_flight); this queue does no in-flight
    tracking, so the second element is always 0.
    """
    n = 0
    pending = []
    for coord in coords:
        pending.append(coord_marshall_int(coord))
        n += 1
        if len(pending) >= self.enqueue_batch_size:
            self.redis_client.rpush(self.queue_key, *pending)
            del pending[:]
    # flush the last partial batch
    if pending:
        self.redis_client.rpush(self.queue_key, *pending)
    return n, 0
def enqueue(self, coord):
    """Enqueue one coord onto its per-zoom SQS queue.

    Invalid coords and coords already in flight are silently dropped.
    """
    if not coord_is_valid(coord):
        # TODO log?
        return
    coord_int = coord_marshall_int(coord)
    # consistency fix: _inflight takes the marshalled coord int (see
    # enqueue_batch), but was previously passed the raw coord here
    if not self._inflight(coord_int):
        payload = serialize_coord(coord)
        message = RawMessage()
        message.set_body(payload)
        sqs_queue_name = self.get_queue_name_for_zoom(coord.zoom)
        sqs_queue = self.sqs_queue_for_name.get(sqs_queue_name)
        assert sqs_queue, 'No queue found for: %s' % sqs_queue_name
        sqs_queue.write(message)
        self._add_to_flight(coord_int)
def group(self, coords):
    """return CoordGroups that can be used to send to queues

    Each CoordGroup represents a message that can be sent to a
    particular queue, stamped with the queue_id. The list of coords,
    which can be 1, is what should get used for the payload for each
    queue message.
    """
    # one bucket per zoom range item, in the same order
    groups = []
    for i in range(len(self.zoom_range_items)):
        groups.append([])
    # first group the coordinates based on their queue
    for coord in coords:
        for i, zri in enumerate(self.zoom_range_items):
            # a range item may additionally require the coord to be
            # (or not be) in the tiles-of-interest set; in_toi of
            # None means "don't care"
            toi_match = zri.in_toi is None or \
                (coord in self.toi_set) == zri.in_toi
            # first matching range wins (break below)
            if zri.start <= coord.zoom < zri.end and toi_match:
                groups[i].append(coord)
                break
    # now, we need to just verify that for each particular group,
    # should they be further grouped, eg by a particular zoom 10
    # tile
    for i, zri in enumerate(self.zoom_range_items):
        group = groups[i]
        if not group:
            continue
        if zri.group_by_zoom is None:
            # no sub-grouping: one message per coordinate
            for coord in group:
                yield CoordGroup([coord], zri.queue_id)
        else:
            # bucket by the parent tile at group_by_zoom
            # NOTE(review): unlike other call sites this does not call
            # .container() after zoomTo — confirm coord_marshall_int
            # handles the possibly fractional coordinate
            by_parent_coords = defaultdict(list)
            for coord in group:
                if coord.zoom >= zri.group_by_zoom:
                    group_coord = coord.zoomTo(zri.group_by_zoom)
                    group_key = coord_marshall_int(group_coord)
                    by_parent_coords[group_key].append(coord)
                else:
                    # this means that a coordinate belonged to a
                    # particular queue but the zoom was lower than
                    # the group by zoom
                    # this probably shouldn't happen
                    # should it be an assert instead?
                    yield CoordGroup([coord], zri.queue_id)
            for group_key, coords in by_parent_coords.iteritems():
                yield CoordGroup(coords, zri.queue_id)
def test_roundtrip_serialization(self):
    """marshall/unmarshall must round-trip coords, including zoom 30."""
    from tilequeue.tile import coord_marshall_int
    from tilequeue.tile import coord_unmarshall_int
    from tilequeue.tile import seed_tiles
    from ModestMaps.Core import Coordinate
    from itertools import chain
    seed_coords = seed_tiles(0, 5)
    example_coords = [
        Coordinate(zoom=20, column=1002463, row=312816),
        Coordinate(zoom=30, column=12345678, row=12345678),
    ]
    coords = chain(seed_coords, example_coords)
    for coord in coords:
        # assertEquals is a deprecated alias; assertEqual is preferred
        self.assertEqual(
            coord, coord_unmarshall_int(coord_marshall_int(coord)))
def test_tilequeue_explode_and_intersect(self):
    """Explode/intersect variant that also checks the metrics dict."""
    from tilequeue.command import explode_and_intersect
    from tilequeue.tile import coord_marshall_int
    from tilequeue.tile import coord_unmarshall_int
    from ModestMaps.Core import Coordinate
    sample_coord = Coordinate(zoom=14, column=250, row=250)
    sample_coord_int = coord_marshall_int(sample_coord)
    tiles_of_interest = [sample_coord_int]
    for i in (10, 11, 12, 13):
        coord = sample_coord.zoomTo(i)
        coord_int = coord_marshall_int(coord)
        tiles_of_interest.append(coord_int)
    exploded, metrics = explode_and_intersect(
        [sample_coord_int], tiles_of_interest, until=11)
    coord_ints = list(exploded)
    for coord_int in coord_ints:
        coord = coord_unmarshall_int(coord_int)
        # failUnless is a deprecated alias; assertTrue is preferred
        self.assertTrue(coord.zoom > 10)
    self.assertEqual(4, len(coord_ints))
    self.assertEqual(4, metrics['hits'])
    self.assertEqual(0, metrics['misses'])
    self.assertEqual(4, metrics['total'])
def job_done(self, coord_message):
    """Ack a finished job: clear in-flight state and delete the
    message from the SQS queue named in the message metadata."""
    metadata = coord_message.metadata or {}
    queue_name = metadata.get('queue_name')
    assert queue_name, \
        'Missing queue name metadata for coord: %s' % serialize_coord(
            coord_message.coord)
    sqs_queue = self.sqs_queue_for_name.get(queue_name)
    assert sqs_queue, 'Missing queue for: %s' % queue_name
    self.redis_client.srem(
        self.inflight_key, coord_marshall_int(coord_message.coord))
    sqs_queue.delete_message(coord_message.message_handle)
def parse_log_file(log_file):
    """Parse tile request log lines into (ip, datetime, coord_int)
    tuples, skipping lines that don't match the expected format."""
    ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    # didn't match against explicit date pattern, in case it changes
    date_pattern = r'\[([\d\w\s\/:]+)\]'
    tile_id_pattern = r'\/([\w]+)\/([\d]+)\/([\d]+)\/([\d]+)\.([\d\w]*)'
    log_pattern = r'%s - - %s "([\w]+) %s.*' % (
        ip_pattern, date_pattern, tile_id_pattern)
    tile_log_records = []
    for log_string in log_file:
        match = re.search(log_pattern, log_string)
        if not match or len(match.groups()) != 8:
            continue
        ip = match.group(1)
        timestamp = datetime.strptime(match.group(2), '%d/%B/%Y %H:%M:%S')
        coord_int = coord_marshall_int(
            create_coord(match.group(6), match.group(7), match.group(5)))
        tile_log_records.append((ip, timestamp, coord_int))
    return tile_log_records
def parse_log_file(log_file):
    """Parse tile request log lines into (ip, datetime, coord_int)
    tuples, skipping lines that don't match the expected format."""
    # fix: these patterns contain backslash sequences (\d, \w, \[)
    # that are invalid string escapes without the r prefix; the
    # sibling implementation already uses raw strings
    ip_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    # didn't match against explicit date pattern, in case it changes
    date_pattern = r'\[([\d\w\s\/:]+)\]'
    tile_id_pattern = r'\/([\w]+)\/([\d]+)\/([\d]+)\/([\d]+)\.([\d\w]*)'
    log_pattern = r'%s - - %s "([\w]+) %s.*' % (ip_pattern, date_pattern,
                                                tile_id_pattern)
    tile_log_records = []
    for log_string in log_file:
        match = re.search(log_pattern, log_string)
        if match and len(match.groups()) == 8:
            tile_log_records.append((
                match.group(1),
                datetime.strptime(match.group(2), '%d/%B/%Y %H:%M:%S'),
                coord_marshall_int(
                    create_coord(match.group(6), match.group(7),
                                 match.group(5)))))
    return tile_log_records
def is_inflight(self, coord):
    """Return True if coord is currently marked as in flight."""
    return self.redis_client.sismember(
        self.inflight_key, coord_marshall_int(coord))
def coord_int_at_mercator_point(z, x, y):
    """Return the marshalled coord int for a mercator point at zoom z."""
    return coord_marshall_int(mercator_point_to_coord(z, x, y))
def __call__(self, stop):
    """Worker loop: pull coords from input_queue, fetch their data and
    push results onto output_queue until stopped or a sentinel (None)
    is received.
    """
    saw_sentinel = False
    while not stop.is_set():
        try:
            data = self.input_queue.get(timeout=timeout_seconds)
        except Queue.Empty:
            # timed out waiting for work; re-check the stop flag
            continue
        if data is None:
            # sentinel: upstream producer is finished
            saw_sentinel = True
            break
        coord = data['coord']
        start = time.time()
        try:
            fetch_data = self.fetcher(coord)
        except:
            # catch-all: log the failure and move on to the next coord
            exc_type, exc_value, exc_traceback = sys.exc_info()
            stacktrace = format_stacktrace_one_line(
                (exc_type, exc_value, exc_traceback))
            if isinstance(exc_value, TransactionRollbackError):
                # rollbacks are expected under contention; warn only
                log_level = logging.WARNING
            else:
                log_level = logging.ERROR
            self.logger.log(log_level, 'Error fetching: %s - %s' % (
                serialize_coord(coord), stacktrace))
            continue
        metadata = data['metadata']
        metadata['timing']['fetch_seconds'] = time.time() - start
        # if we are at zoom level 16, it will serve as a metatile
        # to derive the tiles underneath it
        cut_coords = None
        if coord.zoom == 16:
            cut_coords = []
            async_jobs = []
            children_until = 20
            # ask redis if there are any tiles underneath in the
            # tiles of interest set
            rci = self.redis_cache_index
            async_fn = rci.is_coord_int_in_tiles_of_interest
            # fan the membership checks out over the io pool
            for child in coord_children_range(coord, children_until):
                zoomed_coord_int = coord_marshall_int(child)
                async_result = self.io_pool.apply_async(
                    async_fn, (zoomed_coord_int,))
                async_jobs.append((child, async_result))
            async_exc_info = None
            for async_job in async_jobs:
                zoomed_coord, async_result = async_job
                try:
                    is_coord_in_tiles_of_interest = async_result.get()
                except:
                    # remember the failure but keep draining results
                    async_exc_info = sys.exc_info()
                    stacktrace = format_stacktrace_one_line(async_exc_info)
                    self.logger.error(stacktrace)
                else:
                    if is_coord_in_tiles_of_interest:
                        cut_coords.append(zoomed_coord)
            if async_exc_info:
                # any redis failure means this coord is skipped entirely
                continue
        data = dict(
            metadata=metadata,
            coord=coord,
            feature_layers=fetch_data['feature_layers'],
            unpadded_bounds=fetch_data['unpadded_bounds'],
            cut_coords=cut_coords,
        )
        # retry the put until it succeeds or we're told to stop
        while not _non_blocking_put(self.output_queue, data):
            if stop.is_set():
                break
    if not saw_sentinel:
        # stopped early: drain the input queue so producers don't block
        _force_empty_queue(self.input_queue)
    self.logger.debug('data fetch stopped')
def intersect(coords, tiles_of_interest=None):
    """Yield only the coords present in tiles_of_interest.

    Bug fix: the default of None previously raised a TypeError on the
    first membership test; treat a missing TOI as empty (nothing
    intersects) instead.
    """
    if tiles_of_interest is None:
        tiles_of_interest = ()
    for coord in coords:
        serialized_coord = coord_marshall_int(coord)
        if serialized_coord in tiles_of_interest:
            yield coord
def enqueue(self, coord):
    """Push a single marshalled coord onto the redis queue."""
    coord_int = coord_marshall_int(coord)
    # bug fix: rpush requires the key as its first argument; this was
    # previously called with only the value (cf. enqueue_batch, which
    # pushes onto self.queue_key)
    self.redis_client.rpush(self.queue_key, coord_int)
def _inflight(self, coord):
    """True if coord is marked in flight; always False while seeding."""
    if self.is_seeding:
        return False
    return self.redis_client.sismember(
        self.inflight_key, coord_marshall_int(coord))
def convert_to_coord_ints(coords):
    """Lazily generate the marshalled int form of each coord."""
    for each in coords:
        yield coord_marshall_int(each)
def _add_to_flight(self, coord):
    """Mark coord as in flight by adding its int form to the redis set."""
    coord_int = coord_marshall_int(coord)
    self.redis_client.sadd(self.inflight_key, coord_int)
def job_done(self, coord_message):
    """Ack a completed job: clear in-flight state, delete the message."""
    self.redis_client.srem(
        self.inflight_key, coord_marshall_int(coord_message.coord))
    self.sqs_queue.delete_message(coord_message.message_handle)
def unmark_inflight(self, coord):
    """Remove coord from the in-flight set, if present."""
    self.redis_client.srem(
        self.inflight_key, coord_marshall_int(coord))