def pair_partition_to_vec(input_data: Tuple[Dict, Tuple[str, str], Iterable[Union[PosNegExample, UnlabeledExample, np.ndarray]]]):
    """Convert one partition of example pairs into a single multi-index feature DataFrame.

    `input_data` bundles (processed_specs, fields, partition_data) so the function
    can be dispatched as one argument to a worker pool.
    """
    processed_specs, fields, partition_data = input_data
    columns = get_nested_index(fields)

    frames = []
    for example in partition_data:
        Encoding.encoding_cnt = 0

        # hack to get named tuples to work in parallel
        if isinstance(example, np.ndarray):
            example = PosNegExample(*example)

        # use positional indices because we don't know the attribute names here
        neg_vec = count_violations_memoized(
            processed_specs,
            Task(example.data, Query.from_vegalite(example[4]), example.task))
        pos_vec = count_violations_memoized(
            processed_specs,
            Task(example.data, Query.from_vegalite(example[5]), example.task))

        # Reformat the json data so that we can insert it into a multi index data frame.
        # https://stackoverflow.com/questions/24988131/nested-dictionary-to-multiindex-dataframe-where-dictionary-keys-are-column-label
        row = {(fields[0], key): value for key, value in neg_vec.items()}
        row.update({(fields[1], key): value for key, value in pos_vec.items()})
        row[('source', '')] = example.source
        row[('task', '')] = example.task

        frames.append(pd.DataFrame(row, columns=columns, index=[example.pair_id]))

    return pd.concat(frames)
def __mutate_spec(self, base_spec: Spec, props: List[str], prop_index: int, seen: Set[Spec], specs: List[Spec]):
    """Recursively enumerate mutations of `base_spec` over `props[prop_index:]`.

    Valid, previously unseen fully-mutated specs are appended to `specs`;
    `seen` dedupes specs within the current group.
    """
    # base case: every property has been assigned a value
    if prop_index == len(props):
        self.model.post_improve(base_spec, props)
        base_spec['data'] = {'url': self.data_url}

        # within a group, don't repeat the same specs
        if base_spec in seen:
            return
        seen.add(base_spec)

        if is_valid(Task(self.data, Query.from_vegalite(base_spec))):
            specs.append(base_spec)
        return

    # recursive case: try every enum value for the next property
    current_prop = props[prop_index]
    for enum in self.model.get_enums(current_prop):
        candidate = deepcopy(base_spec)
        self.model.mutate_prop(candidate, current_prop, enum)
        # recurse on the remaining properties
        self.__mutate_spec(candidate, props, prop_index + 1, seen, specs)
def test_stack_agg(self):
    """A stacked bar with aggregates on several channels is accepted as valid."""
    query = Query.from_vegalite({
        'mark': 'bar',
        'encoding': {
            'x': {
                'type': 'nominal',
                'field': 'n1',
            },
            'y': {
                'type': 'quantitative',
                'field': 'q1',
                'stack': 'zero',
                'aggregate': 'sum'
            },
            'detail': {
                'type': 'nominal',
                'field': 'n2'
            },
            'color': {
                'type': 'quantitative',
                'field': 'q2',
                'aggregate': 'mean'
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712); pytest output is clearer
    assert is_valid(Task(data, query), True)
def test_is_valid():
    """Text mark without required channels is invalid; a simple point plot is valid."""
    data = Data(fields=[Field('foo', 'number')])

    invalid = Query.from_vegalite({
        'mark': 'text',
        'encoding': {
            'x': {'field': 'foo', 'type': 'quantitative'}
        }
    })
    # bare truthiness asserts instead of `== False` / `== True` (flake8 E712)
    assert not is_valid(Task(data, invalid))

    valid = Query.from_vegalite({
        'mark': 'point',
        'encoding': {
            'x': {'field': 'foo', 'type': 'quantitative'}
        }
    })
    assert is_valid(Task(data, valid))
def test_one_bar(self):
    """A single quantitative bar is a valid chart."""
    query = Query.from_vegalite({
        'mark': 'bar',
        'encoding': {
            'y': {
                'type': 'quantitative',
                'field': 'q1'
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712)
    assert is_valid(Task(data, query), True)
def test_row_only(self):
    """Using only the `row` facet channel is rejected as invalid."""
    query = Query.from_vegalite({
        'mark': 'point',
        'encoding': {
            'row': {
                'type': 'nominal',
                'field': 'n1'
            }
        }
    })
    # `assert not ...` instead of `== False` (flake8 E712)
    assert not is_valid(Task(data, query), True)
def test_only_one_agg(self):
    """Aggregating only one of two quantitative channels is invalid."""
    query = Query.from_vegalite({
        'mark': 'point',
        'encoding': {
            'x': {
                'type': 'quantitative',
                'field': 'q1'
            },
            'y': {
                'type': 'quantitative',
                'field': 'q2',
                'aggregate': 'mean'
            }
        }
    })
    # `assert not ...` instead of `== False` (flake8 E712)
    assert not is_valid(Task(data, query), True)
def test_heatmap(self):
    """A rect mark over nominal x and binned ordinal y (heatmap) is valid."""
    query = Query.from_vegalite({
        'mark': 'rect',
        'encoding': {
            'x': {
                'type': 'nominal',
                'field': 'n1',
            },
            'y': {
                'type': 'ordinal',
                'field': 'q1',
                'bin': True
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712)
    assert is_valid(Task(data, query), True)
def test_hist(self):
    """A binned-x / count-y bar chart (histogram) is valid."""
    query = Query.from_vegalite({
        'mark': 'bar',
        'encoding': {
            'x': {
                'type': 'quantitative',
                'field': 'q1',
                'bin': True
            },
            'y': {
                'type': 'quantitative',
                'aggregate': 'count'
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712)
    assert is_valid(Task(data, query), True)
def test_no_auto_bin(self):
    """No encoding should pick up a `stack` setting automatically."""
    q = Query.from_vegalite({
        'mark': 'bar',
        'encoding': {
            'x': {
                'type': 'nominal'
            },
            'y': {
                'type': 'quantitative',
                'aggregate': 'mean'
            },
            'color': {
                'type': 'nominal'
            }
        }
    })
    # `not any(...)` instead of building a list and checking `len(...) == 0`
    assert not any(e.stack for e in q.encodings)
def test_count_violations():
    """Counting soft-constraint violations on a real dataset yields expected numbers."""
    data = Data.from_csv('examples/data/cars.csv')
    query_json = {
        'mark': 'bar',
        'data': {
            'url': 'data/cars.csv'
        },
        'encoding': {
            'x': {
                'field': 'origin',
                'type': 'ordinal'
            },
            'y': {
                'field': 'horsepower',
                'type': 'quantitative',
                'aggregate': 'mean'
            }
        }
    }
    violations = count_violations(Task(data, Query.from_vegalite(query_json)))
    # membership test on the dict directly; `.keys()` is redundant (SIM118)
    assert 'encoding' in violations
    assert violations.get('encoding') == 2
def test_scatter(self):
    """A four-channel scatter plot (x, y, color, size) is valid."""
    query = Query.from_vegalite({
        'mark': 'point',
        'encoding': {
            'x': {
                'type': 'quantitative',
                'field': 'q1',
            },
            'y': {
                'type': 'quantitative',
                'field': 'q2'
            },
            'color': {
                'type': 'nominal',
                'field': 'n2'
            },
            'size': {
                'type': 'quantitative',
                'field': 'q3'
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712)
    assert is_valid(Task(data, query), True)
def test_stack_q_q(self):
    """A stacked area chart over two quantitative fields is valid."""
    query = Query.from_vegalite({
        'mark': 'area',
        'encoding': {
            'x': {
                'type': 'quantitative',
                'field': 'q1',
                'scale': {
                    'zero': False
                }
            },
            'y': {
                'type': 'quantitative',
                'field': 'q2',
                'stack': 'zero'
            },
            'color': {
                'type': 'nominal',
                'field': 'n1'
            }
        }
    })
    # bare assert instead of `== True` (flake8 E712)
    assert is_valid(Task(data, query), True)
def run_spec(data, spec):
    """Wrap a raw vega-lite spec and data into a Task and run it."""
    return run(Task(data, Query.from_vegalite(spec)))