class abc_base_2(ABC):
    """Abstract fixture with two coroutine methods, each also exposed through `make_sync`."""

    @abstractmethod
    async def doit_async_1(self):
        """First coroutine; this base version only raises `NotImplementedError`."""
        raise NotImplementedError

    @abstractmethod
    async def doit_async_2(self):
        """Second coroutine; this base version only raises `NotImplementedError`."""
        raise NotImplementedError

    # Class-level wrappers produced by `make_sync` from the two coroutines above.
    doit_1 = make_sync(doit_async_1)
    doit_2 = make_sync(doit_async_2)
def test_wrap_kwargs_specified_loop():
    """Call a wrapped keyword-argument function from inside a coroutine run on the event loop."""
    wrapped = make_sync(func_with_kwargs)

    async def doit():
        outcome = wrapped(bins=5, range=(5, 10))
        assert outcome == 10

    # Drive the coroutine to completion on the loop returned by get_event_loop().
    get_event_loop().run_until_complete(doit())
def test_wrap_with_running_loop():
    """Call the wrapped `simple_func` from within a coroutine that the event loop is running."""
    wrapped = make_sync(simple_func)

    async def doit():
        outcome = wrapped(5)
        assert outcome == 6

    # Drive the coroutine to completion on the loop returned by get_event_loop().
    get_event_loop().run_until_complete(doit())
class tester:
    """Fixture holding an integer `b` and offering an async adder plus its `make_sync` wrapper."""

    def __init__(self, b: int):
        """Remember `b`; it is added to the argument of `my_async`."""
        self._b = b

    async def my_async(self, a: int) -> int:
        """Sleep for 0.01 seconds, then return `a` plus the stored `b`."""
        await sleep(0.01)
        stored = self._b
        return a + stored

    # Wrapper produced by `make_sync` from the coroutine method above.
    my = make_sync(my_async)
async def test_client_session_different_threads():
    """Check that the session obtained via the sync wrapper is a different object than the awaited one."""
    from make_it_sync import make_sync

    async def get_a_client_async():
        session = await default_client_session()
        return session

    get_a_client = make_sync(get_a_client_async)

    # One session through the make_sync wrapper, one awaited directly in this coroutine.
    via_wrapper = get_a_client()
    via_await = await get_a_client_async()

    assert via_wrapper is not via_await
def test_wrap_normal():
    """The wrapped `simple_func` is callable without `await` and yields 5 for input 4."""
    wrapped = make_sync(simple_func)
    outcome = wrapped(4)
    assert outcome == 5
def get_playlist(self, channel_id: str, use_cache: bool = True) -> Union[str, None]:
    """
    Blocking counterpart of `self.async_client.get_playlist`.

    Arguments:
        channel_id      Forwarded by keyword to the async client.
        use_cache       Forwarded by keyword to the async client (defaults to True).

    Returns:
        Whatever the wrapped async call produces (annotated as `str` or `None`).
    """
    blocking_call = make_sync(self.async_client.get_playlist)
    return blocking_call(channel_id=channel_id, use_cache=use_cache)
def get_segment(self, path: str) -> Union[bytes, None]:
    """
    Blocking counterpart of `self.async_client.get_segment`.

    Arguments:
        path            Forwarded by keyword to the async client.

    Returns:
        Whatever the wrapped async call produces (annotated as `bytes` or `None`).
    """
    blocking_call = make_sync(self.async_client.get_segment)
    return blocking_call(path=path)
class abc_base(ABC):
    """Abstract fixture with one coroutine method that is also exposed through `make_sync`."""

    @abstractmethod
    async def doit_async(self):
        """Coroutine to be overridden; this base version only raises `NotImplementedError`."""
        raise NotImplementedError

    # Class-level wrapper produced by `make_sync` from the coroutine above.
    doit = make_sync(doit_async)
def test_wrap_docstring():
    """The wrapper returned by `make_sync` carries the same `__doc__` as `simple_func`."""
    expected_doc = simple_func.__doc__
    wrapped = make_sync(simple_func)
    assert expected_doc == wrapped.__doc__
def get_configuration(self) -> Optional[Dict[str, Any]]:
    """
    Blocking counterpart of `self.async_client.get_configuration`.

    Returns:
        Whatever the wrapped async call produces (annotated as a dict or `None`).
    """
    blocking_call = make_sync(self.async_client.get_configuration)
    return blocking_call()
def test_wrap_kwargs_default():
    """Calling the wrapped `simple_kwargs` with only the positional argument yields 11."""
    wrapped = make_sync(simple_kwargs)
    outcome = wrapped(1)
    assert outcome == 11
def get_now_playing(self, channel: XMChannel) -> Union[Dict[str, Any], None]:
    """
    Blocking counterpart of `self.async_client.get_now_playing`.

    Arguments:
        channel         Forwarded positionally to the async client.

    Returns:
        Whatever the wrapped async call produces (annotated as a dict or `None`).
    """
    blocking_call = make_sync(self.async_client.get_now_playing)
    return blocking_call(channel)
from typing import Tuple
from dataframe_expressions import DataFrame
import hep_tables
from make_it_sync import make_sync
import matplotlib.pyplot as plt
import numpy as np
import hl_tables.local as local


async def histogram_async(df: DataFrame, bins: int = 10, range: Tuple[float, float] = (0, 1)):
    """
    Build a histogram of `df` and draw it on a new matplotlib axes.

    Arguments:
        df          Data frame expression handed to `hep_tables.histogram`.
        bins        Passed through as the `bins` keyword of `hep_tables.histogram`.
        range       Passed through as the `range` keyword of `hep_tables.histogram`.

    Returns:
        Nothing; the figure and axes created here are not returned.
    """
    histogram_request = hep_tables.histogram(df, bins=bins, range=range)

    # The local result unpacks into two items - presumably bin contents and bin
    # edges (TODO confirm against `make_local_async`).
    contents, edges = await local.make_local_async(histogram_request)

    _figure, axes = plt.subplots()

    # Repeat the last content value so the y array gains one extra entry
    # before drawing with step='post'.
    padded_contents = np.r_[contents, contents[-1]]
    axes.fill_between(edges, padded_contents, step='post')


# Wrapper produced by `make_sync` from the coroutine above.
histogram = make_sync(histogram_async)
def get_channels(self) -> List[dict]:
    """
    Blocking counterpart of `self.async_client.get_channels`.

    Returns:
        Whatever the wrapped async call produces (annotated as a list of dicts).
    """
    blocking_call = make_sync(self.async_client.get_channels)
    return blocking_call()
class ObjectStream:
    """
    A stream of objects, described by the AST that will produce it.

    The objects can be events, jets, electrons, or just floats, or arrays of floats.
    `ObjectStream` holds onto the AST that will produce this stream of objects. The chain
    of `ObjectStream` objects, linked together, is a DAG that stores the user's intent.

    Every stream has an _object type_. This is the type of the elements of the stream. For
    example, the top stream, the objects are of type `Event` (or `xAODEvent`). If you
    transform an `Event` into a list of jets, then the object type will be a list of `Jet`
    objects. Each element of the stream is an array. You can also lift this second array of
    `Jets` and turn it into a plain stream of `Jets` using the `SelectMany` method below. In
    that case, you'll no longer be able to tell the boundary between events.
    """

    def __init__(self, the_ast: ast.AST):
        """
        Initialize the stream with the ast that will produce this stream of objects.
        The user will almost never use this initializer.

        Arguments:
            the_ast     The python `ast` that represents this stream.
        """
        self._q_ast = the_ast

    @property
    def query_ast(self) -> ast.AST:
        """Return the query `ast` that this `ObjectStream` represents

        Returns:
            ast.AST: The python `ast` that is represented by this query
        """
        return self._q_ast

    def SelectMany(self, func: Union[str, ast.Lambda, Callable]) -> 'ObjectStream':
        """
        Given the current stream's object type is an array or other iterable, return
        the items in this objects type, one-by-one. This has the effect of flattening a
        nested array.

        Arguments:
            func:   The function that should be applied to this stream's objects to return
                    an iterable. Each item of the iterable is now the stream of objects.

        Returns:
            A new ObjectStream of the type of the elements.

        Notes:
            - The function can be a `lambda`, the name of a one-line function, a string that
              contains a lambda definition, or a python `ast` of type `ast.Lambda`.
        """
        return ObjectStream(
            function_call(
                "SelectMany",
                [self._q_ast, cast(ast.AST, parse_as_ast(func))]))

    def Select(self, f: Union[str, ast.Lambda, Callable]) -> 'ObjectStream':
        """
        Apply a transformation function to each object in the stream, yielding a new type of
        object. There is a one-to-one correspondence between the input objects and output
        objects.

        Arguments:
            f:      selection function (lambda)

        Returns:
            A new ObjectStream of the transformed elements.

        Notes:
            - The function can be a `lambda`, the name of a one-line function, a string that
              contains a lambda definition, or a python `ast` of type `ast.Lambda`.
        """
        return ObjectStream(
            function_call(
                "Select",
                [self._q_ast, cast(ast.AST, parse_as_ast(f))]))

    def Where(self, filter: Union[str, ast.Lambda, Callable]) -> 'ObjectStream':
        """
        Filter the object stream, allowing only items for which `filter` evaluates as true
        through.

        Arguments:
            filter      A filter lambda that returns True/False.

        Returns:
            A new ObjectStream that contains only elements that pass the filter function

        Notes:
            - The function can be a `lambda`, the name of a one-line function, a string that
              contains a lambda definition, or a python `ast` of type `ast.Lambda`.
        """
        return ObjectStream(
            function_call(
                "Where",
                [self._q_ast, cast(ast.AST, parse_as_ast(filter))]))

    def AsPandasDF(self, columns=[]) -> 'ObjectStream':
        """
        Return a pandas stream that contains one item, a pandas `DataFrame`.
        This `DataFrame` will contain all the data fed to it. Only non-array datatypes are
        permitted: the data must look like an Excel table.

        Arguments:
            columns     Array of names of the columns. Will default to "col0", "col1", etc.
                        Exception will be thrown if the number of columns do not match.

        Returns:
            A new ObjectStream wrapping a `ResultPandasDF` call on this stream's ast.
        """
        # To get Pandas use the ResultPandasDF function call.
        return ObjectStream(
            function_call("ResultPandasDF", [self._q_ast, as_ast(columns)]))

    def AsROOTTTree(self, filename, treename, columns=[]) -> 'ObjectStream':
        """
        Return the sequence of items as a ROOT TTree. Each item in the ObjectStream
        will get one entry in the file. The items must be of types that the infrastructure
        can work with:

            Float               A tree with a single float in each entry will be written.
            vector<float>       A tree with a list of floats in each entry will be written.
            (<tuple>)           A tree with multiple items (leaves) will be written. Each leaf
                                must have one of the above types. Nested tuples are not
                                supported.

        Arguments:
            filename        Name of the file in which a TTree of the objects will be written.
            treename        Name of the tree to be written to the file
            columns         Array of names of the columns. This must match the number of items
                            in a tuple to be written out.

        Returns:
            A new ObjectStream with type [(filename, treename)]. This is because multiple
            trees may be written by the back end, and need to be concatenated together to get
            the full dataset. The order of the files back is consistent for different queries
            on the same dataset.
        """
        return ObjectStream(
            function_call("ResultTTree", [
                self._q_ast,
                as_ast(columns),
                as_ast(treename),
                as_ast(filename)
            ]))

    def AsParquetFiles(self, filename: str,
                       columns: Union[str, List[str]] = []) -> 'ObjectStream':
        """Returns the sequence of items as a `parquet` file. Each item in the ObjectStream
        gets a separate entry in the file. The items must be of types that the infrastructure
        can work with:

            Float               A tree with a single float in each entry will be written.
            vector<float>       A tree with a list of floats in each entry will be written.
            (<tuple>)           A tree with multiple items (leaves) will be written. Each leaf
                                must have one of the above types. Nested tuples are not
                                supported.
            {k:v, }             A dictionary with named columns. v is either a float or a
                                vector of floats.

        Arguments:
            filename            Name of a file in which the data will be written. Depending on
                                where the data comes from this may not be used - consider it a
                                suggestion.
            columns             If the data does not arrive by dictionary, then these are the
                                column names.

        Returns:
            A new `ObjectStream` with type `[filename]`. This is because multiple files may be
            written by the backend - the data should be concatenated together to get a final
            result. The order of the files back is consistent for different queries on the
            same dataset.
        """
        return ObjectStream(
            function_call(
                "ResultParquet",
                [self._q_ast, as_ast(columns), as_ast(filename)]))

    def AsAwkwardArray(self, columns=[]) -> 'ObjectStream':
        """
        Return a stream that contains one item, an `awkward` array, or dictionary of
        `awkward` arrays. This `awkward` will contain all the data fed to it.

        Arguments:
            columns     Array of names of the columns. Will default to "col0", "col1", etc.
                        Exception will be thrown if the number of columns do not match.

        Returns:
            An `ObjectStream` with the `awkward` array data as its one and only element.
        """
        return ObjectStream(
            function_call("ResultAwkwardArray", [self._q_ast, as_ast(columns)]))

    def _get_executor(self,
                      executor: Union[Callable[[ast.AST], Awaitable[Any]], None] = None) \
            -> Callable[[ast.AST], Awaitable[Any]]:
        """
        Returns an executor that can be used to run this.
        Logic separated out as it is used from several different places.

        Arguments:
            executor        Callback to run the AST. Can be synchronous or coroutine.
                            If None, the executor is taken from the dataset found in
                            this stream's ast.

        Returns:
            An executor that is either synchronous or a coroutine.
        """
        if executor is not None:
            return executor

        # Imported here rather than at module level (kept as in the original code).
        from .event_dataset import find_ed_in_ast
        ed = find_ed_in_ast(self._q_ast)
        return ed.execute_result_async

    async def value_async(self,
                          executor: Union[Callable[[ast.AST], Any], None] = None) -> Any:
        """
        Evaluate the ObjectStream computation graph. Tracks back to the source dataset to
        understand how to evaluate the AST. It is possible to pass in an executor to override
        that behavior (used mostly for testing).

        Arguments:
            executor        A function that when called with the ast will return the result,
                            or a future/awaitable for the result. If None, then uses the
                            default executor. Normally is None and the default executor
                            specified by the `EventDatasource` is called instead.

        Returns:
            The first element of the ObjectStream after evaluation.

        Note:
            This is the non-blocking version - it will return a future which can be `await`ed
            upon until the query is done.
        """
        # Function-scope import so this block does not depend on the module's import list.
        import inspect

        # Fetch the executor
        exe = self._get_executor(executor)

        # Run it. The executor is documented as "synchronous or coroutine", so only
        # await when what it handed back is actually awaitable; a plain value from a
        # synchronous executor is returned as-is instead of raising a TypeError.
        outcome = exe(self._q_ast)
        if inspect.isawaitable(outcome):
            outcome = await outcome
        return outcome

    # Wrapper produced by `make_sync` from `value_async`.
    value = make_sync(value_async)
from dataframe_expressions import DataFrame
from make_it_sync import make_sync


async def count_async(df: DataFrame) -> int:
    """
    Count a dataframe along `axis=0` and bring the answer back locally.

    Arguments:
        df      Data frame expression on which `Count(axis=0)` is invoked.

    Returns:
        The result of `make_local_async` for the count expression (annotated as `int`).
    """
    from hl_tables.local import make_local_async

    count_expression = df.Count(axis=0)
    return await make_local_async(count_expression)


# Wrapper produced by `make_sync` from the coroutine above.
count = make_sync(count_async)
def close_session(self):
    """
    Blocking counterpart of `self.async_client.close_session`.

    Returns:
        Whatever the wrapped async call produces.
    """
    blocking_call = make_sync(self.async_client.close_session)
    return blocking_call()
class abc_base(ABC):
    """Abstract fixture whose coroutine takes one required and two defaulted arguments."""

    @abstractmethod
    async def doit_async(self, a1: int, a2: int = 20, a3: int = 30):
        """Coroutine to be overridden; this base version only raises `NotImplementedError`."""
        raise NotImplementedError

    # Class-level wrapper produced by `make_sync` from the coroutine above.
    doit = make_sync(doit_async)
def test_wrap_no_await():
    """The wrapped `simple_no_wait` returns 2 when called with 1."""
    wrapped = make_sync(simple_no_wait)
    outcome = wrapped(1)
    assert outcome == 2
def test_wrap_with_loop():
    """The wrapped `simple_func` still yields 5 for input 4 after `get_event_loop()` has been called."""
    wrapped = make_sync(simple_func)

    # Only the call matters here; the returned loop object is not used.
    get_event_loop()

    outcome = wrapped(4)
    assert outcome == 5
def get_channel(self, name: str) -> Union[XMChannel, None]:
    """
    Blocking counterpart of `self.async_client.get_channel`.

    Arguments:
        name            Forwarded positionally to the async client.

    Returns:
        Whatever the wrapped async call produces (annotated as `XMChannel` or `None`).
    """
    blocking_call = make_sync(self.async_client.get_channel)
    return blocking_call(name)
def test_wrap_exception():
    """An exception raised through the wrapped `simple_raise` reaches the caller with its message."""
    wrapped = make_sync(simple_raise)

    with pytest.raises(Exception) as caught:
        wrapped(5)

    message = str(caught.value)
    assert "hi there" in message
from typing import Any, List
from dataframe_expressions import DataFrame
from make_it_sync import make_sync
from .runner import runner, result
from .servicex.xaod_runner import xaod_runner
from .awkward.awkward_runner import awkward_runner

# Runners are tried in this order by `make_local_async`.
runners: List[runner] = [xaod_runner(), awkward_runner()]


async def make_local_async(df: DataFrame) -> Any:
    """
    Get the data from the remote system that is represented by `df` and get it here,
    locally, on this computer.

    Each runner in `runners` processes the output of the previous one; processing
    stops as soon as a runner hands back a `result`.

    Arguments:
        df      The data frame expression to evaluate.

    Returns:
        The `.result` attribute of the final `result` object.

    Raises:
        Exception   If no `result` object was obtained after all runners ran.
    """
    current = df
    for candidate in runners:
        current = await candidate.process(current)
        if isinstance(current, result):
            return current.result

    # Reached when no runner produced a `result` (or when there were no runners).
    if isinstance(current, result):
        return current.result
    raise Exception('Unable to process data frame!')


# Wrapper produced by `make_sync` from the coroutine above.
make_local = make_sync(make_local_async)
def test_wrap_kwargs_given():
    """Passing `me=11` through the wrapped `simple_kwargs` yields 12 for positional input 1."""
    wrapped = make_sync(simple_kwargs)
    outcome = wrapped(1, me=11)
    assert outcome == 12
class ServiceXABC(ABC):
    """
    Abstract base class for accessing the ServiceX front-end for a particular dataset.

    It carries a small amount of concrete code (configuration storage and creation of the
    status notifier), while the data access methods themselves are abstract.

    A light weight, mostly immutable, base class that holds basic configuration information
    for use with ServiceX file access, including the dataset name. Subclasses implement the
    various access methods. Note that not all methods may be accessible!
    """

    def __init__(
        self,
        dataset: str,
        image: Optional[str] = None,
        max_workers: int = 20,
        status_callback_factory: Optional[StatusUpdateFactory] = _run_default_wrapper,
    ):
        """
        Create and configure a ServiceX object for a dataset.

        Arguments

            dataset                     Name of a dataset from which queries will be selected.
            image                       Name of transformer image to use to transform the
                                        data. If None the default implementation is used.
            max_workers                 Maximum number of transformers to run simultaneously
                                        on ServiceX.
            status_callback_factory     Factory to create a status notification callback for
                                        each query. One is created per query. If None,
                                        `_null_progress_feedback` is used instead.

        Notes:

            -  The `status_callback_factory` argument, by default, uses the `tqdm` library to
               render progress bars in a terminal window or a graphic in a Jupyter notebook
               (with proper jupyter extensions installed). If it is specified as None, no
               updates will be rendered. A custom callback function can also be specified
               which takes `(total_files, transformed, downloaded, skipped)` as an argument.
               The `total_files` parameter may be `None` until the system knows how many
               files need to be processed (and some files can even be completed before that
               is known).
        """
        self._dataset = dataset
        self._image = image
        self._max_workers = max_workers

        # The notifier itself cannot be built until a query is actually made, so
        # only the factory is remembered here.
        if status_callback_factory is None:
            chosen_factory = _null_progress_feedback
        else:
            chosen_factory = status_callback_factory
        self._status_callback_factory = chosen_factory

    def _create_notifier(self, downloading: bool) -> _status_update_wrapper:
        """Internal method to create an updater from the status call-back factory."""
        raw_callback = self._status_callback_factory(self._dataset, downloading)
        return _status_update_wrapper(raw_callback)

    @abstractmethod
    async def get_data_rootfiles_async(self, selection_query: str) -> List[Path]:
        """
        Fetch query data from ServiceX matching `selection_query` and return it as
        a list of root files. The files are uniquely ordered (the same query will always
        return the same order).

        Arguments:
            selection_query     The `qastle` string specifying the data to be queried

        Returns:
            root_files          The list of root files
        """

    @abstractmethod
    async def get_data_pandas_df_async(self, selection_query: str) -> pd.DataFrame:
        """
        Fetch query data from ServiceX matching `selection_query` and return it as
        a pandas dataframe. The data is uniquely ordered (the same query will always
        return the same order).

        Arguments:
            selection_query     The `qastle` string specifying the data to be queried

        Returns:
            df                  The pandas dataframe

        Exceptions:
            xxx                 If the data is not the correct shape (e.g. a flat,
                                rectangular table).
        """

    @abstractmethod
    async def get_data_awkward_async(self, selection_query: str) \
            -> Dict[bytes, ak.Array]:
        """
        Fetch query data from ServiceX matching `selection_query` and return it as
        dictionary of awkward arrays, an entry for each column. The data is uniquely
        ordered (the same query will always return the same order).

        Arguments:
            selection_query     The `qastle` string specifying the data to be queried

        Returns:
            a                   Dictionary of jagged arrays (as needed), one for each
                                column. The dictionary keys are `bytes` to support possible
                                unicode characters.
        """

    @abstractmethod
    async def get_data_parquet_async(self, selection_query: str) -> List[Path]:
        """
        Fetch query data from ServiceX matching `selection_query` and return it as
        a list of parquet files. The files are uniquely ordered (the same query will always
        return the same order).

        Arguments:
            selection_query     The `qastle` string specifying the data to be queried

        Returns:
            parquet_files       The list of parquet files
        """

    # Synchronous versions of the async methods, for ease of use.
    get_data_rootfiles = make_sync(get_data_rootfiles_async)
    get_data_pandas_df = make_sync(get_data_pandas_df_async)
    get_data_awkward = make_sync(get_data_awkward_async)
    get_data_parquet = make_sync(get_data_parquet_async)
def test_wrap_signature():
    """The wrapper returned by `make_sync` reports the same signature text as `simple_func`."""
    original_signature = inspect.signature(simple_func)
    wrapped = make_sync(simple_func)
    wrapped_signature = inspect.signature(wrapped)

    # Compare the rendered text of both signatures.
    assert str(original_signature) == str(wrapped_signature)
def test_wrap_kwargs_defaults():
    """Calling the wrapped `func_with_kwargs` with no arguments yields 14."""
    wrapped = make_sync(func_with_kwargs)
    outcome = wrapped()
    assert outcome == 14
def test_wrap_kwargs_specified():
    """Explicit `bins` and `range` keywords pass through the wrapped `func_with_kwargs`, yielding 10."""
    wrapped = make_sync(func_with_kwargs)
    outcome = wrapped(bins=5, range=(5, 10))
    assert outcome == 10
def authenticate(self) -> bool:
    """
    Blocking counterpart of `self.async_client.authenticate`.

    Returns:
        Whatever the wrapped async call produces (annotated as `bool`).
    """
    blocking_call = make_sync(self.async_client.authenticate)
    return blocking_call()