@gen.coroutine
def batch_get_entity(self, table_name, row_keys, column_names):
  """ Takes in batches of keys and retrieves their corresponding rows.

  Args:
    table_name: The table to access.
    row_keys: A list of keys to access.
    column_names: A list of columns to access.
  Returns:
    A dictionary mapping each row key to a dictionary of that row's columns
    and values: {key: {column_name: value, ...}, ...}
  Raises:
    TypeError: If an argument passed in was not of the expected type.
    AppScaleDBConnectionError: If the batch_get could not be performed due to
      an error with Cassandra.
  """
  if not isinstance(table_name, str):
    raise TypeError('table_name must be a str')
  if not isinstance(column_names, list):
    raise TypeError('column_names must be a list')
  if not isinstance(row_keys, list):
    raise TypeError('row_keys must be a list')

  # Cassandra stores keys as blobs, so convert them to byte arrays.
  row_keys_bytes = [bytearray(row_key) for row_key in row_keys]

  statement = 'SELECT * FROM "{table}" '\
              'WHERE {key} IN %s AND {column} IN %s'.format(
                table=table_name,
                key=ThriftColumn.KEY,
                column=ThriftColumn.COLUMN_NAME,
              )
  query = SimpleStatement(statement, retry_policy=BASIC_RETRIES)

  results = []
  # Split the rows up into chunks to reduce the likelihood of timeouts.
  chunk_indexes = [
    (n, n + ENTITY_FETCH_THRESHOLD)
    for n in xrange(0, len(row_keys_bytes), ENTITY_FETCH_THRESHOLD)]

  # TODO: This can be made more efficient by maintaining a constant number
  # of concurrent requests rather than waiting for each batch to complete.
  for start, end in chunk_indexes:
    parameters = (ValueSequence(row_keys_bytes[start:end]),
                  ValueSequence(column_names))
    try:
      batch_results = yield self.tornado_cassandra.execute(
        query, parameters=parameters)
    except dbconstants.TRANSIENT_CASSANDRA_ERRORS:
      message = 'Exception during batch_get_entity'
      logger.exception(message)
      raise AppScaleDBConnectionError(message)

    results.extend(list(batch_results))

  # Ensure every requested key appears in the result, even if no columns
  # were found for it.
  results_dict = {row_key: {} for row_key in row_keys}
  for (key, column, value) in results:
    if key not in results_dict:
      results_dict[key] = {}
    results_dict[key][column] = value

  raise gen.Return(results_dict)
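
# A minimal sketch of the TODO above, assuming Tornado's locks.Semaphore
# (Tornado >= 4.2) is available: instead of waiting for each chunk to finish
# before issuing the next request, keep up to MAX_CONCURRENT_CHUNKS requests
# in flight at once. `query`, `row_keys_bytes`, `column_names`,
# `ENTITY_FETCH_THRESHOLD`, `ValueSequence`, and `self.tornado_cassandra`
# are the names used in batch_get_entity; MAX_CONCURRENT_CHUNKS and this
# helper itself are hypothetical, and the transient-error handling from
# batch_get_entity is omitted for brevity.
from tornado import gen, locks

MAX_CONCURRENT_CHUNKS = 4  # Hypothetical cap on simultaneous requests.

@gen.coroutine
def _fetch_chunks_concurrently(self, query, row_keys_bytes, column_names):
  """ Fetches all chunks with a bounded number of in-flight requests. """
  semaphore = locks.Semaphore(MAX_CONCURRENT_CHUNKS)

  @gen.coroutine
  def fetch_chunk(start, end):
    # Once MAX_CONCURRENT_CHUNKS requests are running, acquire() blocks
    # here; the slot is released when the `with` block exits.
    with (yield semaphore.acquire()):
      parameters = (ValueSequence(row_keys_bytes[start:end]),
                    ValueSequence(column_names))
      chunk = yield self.tornado_cassandra.execute(
        query, parameters=parameters)
      raise gen.Return(list(chunk))

  futures = [fetch_chunk(start, start + ENTITY_FETCH_THRESHOLD)
             for start in xrange(0, len(row_keys_bytes),
                                 ENTITY_FETCH_THRESHOLD)]
  # Yielding a list of futures waits for all of them; results come back in
  # chunk order regardless of completion order.
  chunk_results = yield futures
  raise gen.Return([row for chunk in chunk_results for row in chunk])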