def test_simple_equals_expressions():
    """Single-condition filter expressions return the expected record counts."""
    dataset = DictSet(TEST_DATA, storage_class=STORAGE_CLASS.MEMORY)
    # (filter expression, expected matching-record count)
    cases = [
        ("name == 'James Potter'", 2),
        ("age == 10", 1),
        ("alive == true", 3),
        ("affiliations is none", 4),
        ("name like '%Potter%'", 4),
    ]
    for expression, expected in cases:
        assert dataset.filter(expression).count() == expected
def test_items():
    """get_items(0, 2) returns the records at ordinals 0 and 2 only."""
    records = [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 2, "value": "two", "plus1": 3},
        {"key": 3, "value": "three", "plus1": 4},
        {"key": 4, "value": "four", "plus1": 5},
    ]
    ds = DictSet(records, storage_class=STORAGE_CLASS.DISK)
    items = [record.as_dict() for record in ds.get_items(0, 2)]
    assert items == [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 3, "value": "three", "plus1": 4},
    ], items
def test_simple_compound_expressions():
    """AND / OR compound filter expressions combine conditions correctly."""
    dataset = DictSet(TEST_DATA, storage_class=STORAGE_CLASS.MEMORY)
    # conjunction narrows the match set
    assert dataset.filter("name like '%Potter' and alive == true").count() == 2
    # disjunction widens it
    assert dataset.filter("name like '%Potter' or alive == true").count() == 5
def test_take():
    """take(n) yields exactly n records, and spendable sets don't over-consume."""
    records = [
        {"key": 1, "value": "one", "plus1": 2},
        {"key": 2, "value": "two", "plus1": 3},
        {"key": 3, "value": "three", "plus1": 4},
        {"key": 4, "value": "four", "plus1": 5},
    ]
    ds = DictSet(records, storage_class=STORAGE_CLASS.MEMORY)

    assert ds.take(1).collect_list().pop() == {
        "key": 1,
        "value": "one",
        "plus1": 2,
    }, ds.take(1).collect_list()
    assert ds.take(2).count() == 2, ds.take(2).count()

    # we had a problem where spendable dictsets were taking n+1 records, but only returning n
    burnable = DictSet(
        (record for record in ds.collect_list()),
        storage_class=STORAGE_CLASS.NO_PERSISTANCE,
    )
    # this print deliberately consumes three of the four records from the
    # non-persistent (single-pass) set
    print(burnable.take(3))
    remaining = len(list(burnable.take(10)))
    assert remaining == 1, remaining
def SqlReader(sql_statement: str, **kwargs):
    """
    Use basic SQL queries to filter Reader.

    Parameters:
        sql_statement: string
        kwargs: parameters to pass to the Reader

    Note:
        `select` is taken from SQL SELECT
        `dataset` is taken from SQL FROM
        `filters` is taken from SQL WHERE
    """
    # some imports here to remove cyclic imports
    from mabel import DictSet, Reader

    # parse the statement into its clauses and log how it was interpreted
    sql = SqlParser(sql_statement)
    get_logger().info(repr(sql))

    # ask the inner Reader for the selected columns PLUS all columns
    # ("<select>, *") so later stages (GROUP BY / HAVING / ORDER BY) can still
    # see fields that aren't in the SELECT list
    actual_select = sql.select_expression
    if sql.select_expression is None:
        actual_select = "*"
    elif sql.select_expression != "*":
        actual_select = sql.select_expression + ", *"

    # NOTE(review): `reducer` is assigned but never referenced again in this
    # function — looks like dead code or an unfinished COUNT(*) optimization;
    # confirm before removing
    reducer = None
    if sql.select_expression == "COUNT(*)":
        reducer = lambda x: {"*": "*"}

    # FROM clause
    # WHERE clause
    if isinstance(sql.dataset, list):
        # it's a list if it's been parsed into a SQL statement,
        # this is how subqueries are interpretted - the parser
        # doesn't extract a dataset name - it collects parts of
        # a SQL statement which it can then pass to a SqlReader
        # to get back a dataset - which we then use as the
        # dataset for the outer query.
        reader = SqlReader("".join(sql.dataset), **kwargs)
    else:
        reader = Reader(
            select=actual_select,
            dataset=sql.dataset,
            filters=sql.where_expression,
            **kwargs,
        )

    # GROUP BY clause
    # grouping is triggered by an explicit GROUP BY, or by any aggregation
    # function appearing in the SELECT clause
    if sql.group_by or any(
        [t["type"] == TOKENS.AGGREGATOR for t in sql.select_evaluator.tokens]  # type:ignore
    ):
        from ...internals.group_by import GroupBy

        # convert the clause into something we can pass to GroupBy
        if sql.group_by:
            groups = [
                group.strip() for group in sql.group_by.split(",") if group.strip() != ""
            ]
        else:
            groups = ["*"]  # we're not really grouping

        aggregations = []
        renames = []
        for t in sql.select_evaluator.tokens:  # type:ignore
            if t["type"] == TOKENS.AGGREGATOR:
                # e.g. SUM(field) -> ("SUM", "field")
                aggregations.append((t["value"], t["parameters"][0]["value"]))
                if t["as"]:
                    t["raw"] = get_function_name(t)
                    renames.append(t)
            elif t["type"] == TOKENS.VARIABLE and t["value"] not in groups:
                # a plain column in a grouped SELECT must be one of the group keys
                raise InvalidSqlError(
                    "Invalid SQL - SELECT clause in a statement with a GROUP BY clause must be made of aggregations or items from the GROUP BY clause."
                )

        if aggregations:
            grouped = GroupBy(reader, groups).aggregate(aggregations)
        else:
            grouped = GroupBy(reader, groups).groups()

        # there could be 250000 groups, so we're not going to load them into memory
        reader = DictSet(grouped)

    # HAVING clause
    # if we have a HAVING clause, filter the grouped data by it
    if sql.having:
        reader = reader.filter(sql.having)

    # SELECT clause
    # NOTE(review): this rebinds `renames` (a list in the GROUP BY branch
    # above) to a dict, discarding the list — appears intentional since the
    # dict is rebuilt from the same tokens, but worth confirming
    renames = {}  # type:ignore
    for t in sql.select_evaluator.tokens:  # type:ignore
        if t["as"]:
            renames[get_function_name(t)] = t["as"]

    def _perform_renames(row):
        # apply "AS" aliases: move each value from its computed name to its alias
        for k, v in [(k, v) for k, v in row.items()]:
            if k in renames:
                row[renames[k]] = row.pop(k, row.get(renames[k]))
        return row

    if renames:
        reader = DictSet(map(_perform_renames, reader))

    reader = reader.select(sql.select_evaluator.fields())  # type:ignore

    # distinct now we have only the columns we're interested in
    if sql.distinct:
        reader = reader.distinct()

    # ORDER BY clause
    if sql.order_by:
        take = 10000  # the Query UI is currently set to 2000
        if sql.limit:
            take = int(sql.limit)
        reader = DictSet(
            reader.sort_and_take(column=sql.order_by, take=take, descending=sql.order_descending))

    # LIMIT clause
    if sql.limit:
        reader = reader.take(sql.limit)

    return reader
    # NOTE(review): this is the tail of a generator whose `def` is above this
    # chunk — it flushes the buffered lines, then any partial trailing record;
    # indentation level assumed, confirm against the enclosing function
    yield from lines
    if carry_forward:
        yield carry_forward


def en(ls):
    # lazily decode each JSONL line into a dict
    for l in ls:
        yield orjson.loads(l)


# load the tweets fixture fully into memory so each timed run below starts
# from the same decoded data
lines = list(en(read_file_clear("tests/data/formats/jsonl/tweets.jsonl")))
print(len(lines))

os.makedirs("_temp", exist_ok=True)

# benchmark: iterate a memory-backed DictSet
with Timer("MEMORY"):
    t = DictSet(lines, storage_class=STORAGE_CLASS.MEMORY)
    for i, r in enumerate(t):
        pass
    print(i)

# benchmark: build (write) a disk-backed DictSet
with Timer("DISK WRITE"):
    t = DictSet(lines, storage_class=STORAGE_CLASS.DISK)
    print(t.count())

# benchmark: re-read the disk-backed DictSet built above
with Timer("DISK READ"):
    for i, r in enumerate(t):
        pass
    print(i)

# benchmark: write + (presumably) read combined — chunk is cut off here,
# the rest of this timed block is outside this view
with Timer("COMBINED"):
    t = DictSet(lines, storage_class=STORAGE_CLASS.DISK)