def test_parse_as_tree(self, *_):
    """Check that ``to_tree()`` on the sample response yields the expected tree.

    The result must be an ``AggsResponseTree`` whose string rendering equals
    ``sample.EXPECTED_RESPONSE_TREE_REPR``.
    """
    aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    aggregations = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=aggs,
        index=None,
        client=None,
        query=None,
    )
    tree = aggregations.to_tree()

    self.assertIsInstance(tree, AggsResponseTree)
    self.assertEqual(tree.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR)
def test_normalize_buckets(self):
    """Check that ``to_normalized()`` on the sample response matches the fixture.

    Both sides go through ``ordered`` before being compared.
    """
    aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    aggregations = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=aggs,
        index=None,
        client=None,
        query=None,
    )
    normalized = aggregations.to_normalized()

    actual = ordered(normalized)
    expected = ordered(sample.EXPECTED_NORMALIZED_RESPONSE)
    self.assertEqual(actual, expected)
def test_applied_nested_path_at_node(self):
    """Check that the correct nested path is detected at each node level.

    The hierarchy declared below contains ``week``, the terms aggregation and
    ``min_f1_score``; the assertions additionally expect a
    ``nested_below_week`` node to exist once ``nested_autocorrect`` is on:

        week
        └── nested_below_week
            └── local_metrics.field_class.name
                └── min_f1_score
    """
    node_hierarchy = DateHistogram(
        name="week",
        field="date",
        interval="1w",
        aggs=[
            Terms(
                name="local_metrics.field_class.name",
                field="local_metrics.field_class.name",
                size=10,
                aggs=[
                    Min(
                        name="min_f1_score",
                        field="local_metrics.performance.test.f1_score",
                    )
                ],
            )
        ],
    )
    agg = Aggs(node_hierarchy, mapping=MAPPING, nested_autocorrect=True)

    # The root date histogram is not under any nested path.
    self.assertIsNone(agg.applied_nested_path_at_node("week"))

    # The nested node and everything below it report the "local_metrics" path.
    for nid in (
        "nested_below_week",
        "local_metrics.field_class.name",
        "min_f1_score",
    ):
        self.assertEqual(agg.applied_nested_path_at_node(nid), "local_metrics")
def test_parse_as_dataframe(self):
    """Check the DataFrame produced by ``to_dataframe()`` on the sample response.

    Verifies the return type, the index level names, the column names and the
    full content (as an index-oriented dict).
    """
    aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    aggregations = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=aggs,
        index=None,
        client=None,
        query=None,
    )
    df = aggregations.to_dataframe()

    self.assertIsInstance(df, pd.DataFrame)

    expected_index_names = {"classification_type", "global_metrics.field.name"}
    self.assertEqual(set(df.index.names), expected_index_names)

    expected_columns = {"avg_f1_micro", "avg_nb_classes", "doc_count"}
    self.assertEqual(set(df.columns), expected_columns)

    expected_content = {
        ("multiclass", "gpc"): {
            "avg_f1_micro": 0.93,
            "avg_nb_classes": 211.12,
            "doc_count": 198,
        },
        ("multiclass", "kind"): {
            "avg_f1_micro": 0.89,
            "avg_nb_classes": 206.5,
            "doc_count": 370,
        },
        ("multilabel", "gpc"): {
            "avg_f1_micro": 0.95,
            "avg_nb_classes": 183.21,
            "doc_count": 119,
        },
        ("multilabel", "ispracticecompatible"): {
            "avg_f1_micro": 0.72,
            "avg_nb_classes": 18.71,
            "doc_count": 128,
        },
        ("multilabel", "preservationmethods"): {
            "avg_f1_micro": 0.8,
            "avg_nb_classes": 9.97,
            "doc_count": 76,
        },
    }
    self.assertEqual(df.to_dict(orient="index"), expected_content)
def test_response_tree(self, uuid_mock):
    """Parse the sample response into an ``AggsResponseTree`` and inspect it.

    ``uuid_mock`` is configured to return consecutive integers (presumably so
    that generated node ids are deterministic -- confirm against the patch
    decorator, which is not visible here).
    """
    uuid_mock.side_effect = range(1000)

    aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    tree = AggsResponseTree(aggs=aggs, index=None)
    response_tree = tree.parse(sample.ES_AGG_RESPONSE)

    self.assertEqual(response_tree.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR)
    self.assertEqual(len(response_tree.list()), 18)

    # First bucket at the "global_metrics.field.name" level whose key is "gpc".
    gpc_candidates = (
        bucket
        for bucket in response_tree.list()
        if bucket.level == "global_metrics.field.name" and bucket.key == "gpc"
    )
    multilabel_gpc_bucket = next(gpc_candidates)

    # bucket_properties exposes the bucket's own level/key followed by its parents'.
    expected_properties = OrderedDict(
        [
            ("global_metrics.field.name", "gpc"),
            ("classification_type", "multilabel"),
        ]
    )
    self.assertEqual(
        response_tree.bucket_properties(multilabel_gpc_bucket),
        expected_properties,
    )
def test_parse_as_tabular_multiple_roots(self):
    """Check ``to_tabular`` when the query declares several root aggregations.

    With ``index_orient=True`` and ``expand_sep=" || "`` the expected result has
    no index names and a single row keyed by the empty tuple, in which bucket
    counts appear under ``"<agg name> || <bucket key>"`` columns.
    """
    # Two aggregations at root level: one bucket agg, one metric agg.
    query = {
        "classification_type": {"terms": {"field": "classification_type"}},
        "avg_f1_score": {
            "avg": {"field": "global_metrics.performance.test.micro.f1_score"}
        },
    }
    my_agg = Aggs(query)

    raw_response = {
        "classification_type": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "multiclass", "doc_count": 439},
                {"key": "multilabel", "doc_count": 433},
            ],
        },
        "avg_f1_score": {"value": 0.815},
    }

    aggregations = Aggregations(
        data=raw_response,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    )
    index_names, index_values = aggregations.to_tabular(
        index_orient=True, expand_sep=" || "
    )

    expected_values = {
        (): {
            "avg_f1_score": 0.815,
            "classification_type || multiclass": 439,
            "classification_type || multilabel": 433,
        }
    }
    self.assertEqual(index_names, [])
    self.assertEqual(index_values, expected_values)
def test_client_bound_response(self, uuid_mock):
    """Navigate an ``IResponse`` by attribute and check the bucket filter query.

    ``uuid_mock`` is configured to return consecutive integers; the client is a
    ``Mock`` restricted to a ``search`` attribute.
    """
    uuid_mock.side_effect = range(1000)
    client_mock = Mock(spec=["search"])

    aggs = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response_tree = AggsResponseTree(aggs=aggs, index=None).parse(
        sample.ES_AGG_RESPONSE
    )
    initial_query = {"term": {"some_field": 1}}
    response = IResponse(
        client=client_mock,
        tree=response_tree,
        index_name="some_index",
        depth=1,
        query=initial_query,
    )

    # Attribute navigation must be discoverable through dir(), which is what
    # ipython relies on for autocompletion.
    root_attributes = dir(response)
    self.assertIn("classification_type_multiclass", root_attributes)
    root_attributes = dir(response)
    self.assertIn("classification_type_multilabel", root_attributes)

    # First level: the "multilabel" bucket.
    multilabel = response.classification_type_multilabel
    self.assertIsInstance(multilabel, IResponse)
    self.assertIs(multilabel._initial_tree, response._tree)

    # Second level: the "gpc" bucket below "multilabel".
    self.assertIn("global_metrics_field_name_gpc", dir(multilabel))
    gpc = multilabel.global_metrics_field_name_gpc
    self.assertIsInstance(gpc, IResponse)
    self.assertIs(gpc._initial_tree, response._tree)

    # The filter used to list the documents of a bucket is expected to combine
    # the bucket's own term, its parent's term and the initial query.
    expected_filter = {
        "bool": {
            "must": [
                {"term": {"global_metrics.field.name": {"value": "gpc"}}},
                {"term": {"classification_type": {"value": "multilabel"}}},
                {"term": {"some_field": {"value": 1}}},
            ]
        }
    }
    self.assertTrue(equal_queries(gpc.get_bucket_filter(), expected_filter))
def test_parse_as_tabular(self):
    """Check ``to_tabular`` on the sample response in both orientations.

    With ``index_orient=True`` rows are expected as a dict keyed by the tuple of
    bucket keys; with ``index_orient=False`` they are expected as a list of
    records that also carry the bucket keys as regular entries.
    """
    # Single aggregation at root level.
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    expected_names = ["classification_type", "global_metrics.field.name"]

    # (classification_type, field name, avg_f1_micro, avg_nb_classes, doc_count),
    # listed in the order expected for the record-oriented output.
    rows = [
        ("multilabel", "ispracticecompatible", 0.72, 18.71, 128),
        ("multilabel", "gpc", 0.95, 183.21, 119),
        ("multilabel", "preservationmethods", 0.8, 9.97, 76),
        ("multiclass", "kind", 0.89, 206.5, 370),
        ("multiclass", "gpc", 0.93, 211.12, 198),
    ]
    expected_by_index = {
        (class_type, field_name): {
            "avg_f1_micro": f1_micro,
            "avg_nb_classes": nb_classes,
            "doc_count": doc_count,
        }
        for class_type, field_name, f1_micro, nb_classes, doc_count in rows
    }
    expected_records = [
        {
            "avg_f1_micro": f1_micro,
            "avg_nb_classes": nb_classes,
            "classification_type": class_type,
            "doc_count": doc_count,
            "global_metrics.field.name": field_name,
        }
        for class_type, field_name, f1_micro, nb_classes, doc_count in rows
    ]

    # index_orient = True
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).to_tabular(index_orient=True)
    self.assertEqual(index_names, expected_names)
    self.assertEqual(index_values, expected_by_index)

    # index_orient = False
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).to_tabular(index_orient=False)
    self.assertEqual(index_names, expected_names)
    self.assertEqual(index_values, expected_records)
def test_add_node_with_mapping(self):
    """Add nodes one by one to a mapping-aware ``Aggs`` with nested autocorrect.

    Covers: a regular insertion, rejection of an unknown field, rejection of an
    aggregation type incompatible with the field, automatic insertion of a
    ``nested`` clause, reuse of that clause, and automatic insertion of a
    ``reverse_nested`` clause.
    """

    def expected_query(nested_children):
        # Expected serialization once a nested clause exists below "workflow".
        return {
            "workflow": {
                "aggs": {
                    "nested_below_workflow": {
                        "aggs": nested_children,
                        "nested": {"path": "local_metrics"},
                    }
                },
                "terms": {"field": "workflow"},
            }
        }

    f1_score_avg = {"avg": {"field": "local_metrics.performance.test.f1_score"}}
    precision_avg = {"avg": {"field": "local_metrics.performance.test.precision"}}

    with_mapping = Aggs(mapping=MAPPING, nested_autocorrect=True)
    self.assertEqual(len(with_mapping.list()), 0)

    # Regular node.
    with_mapping = with_mapping.aggs(Terms("workflow", field="workflow"))
    self.assertEqual(
        with_mapping.to_dict(), {"workflow": {"terms": {"field": "workflow"}}}
    )

    # A field aggregation on a field absent from the mapping is rejected.
    with self.assertRaises(AbsentMappingFieldError):
        with_mapping.aggs(
            Terms("imaginary_agg", field="imaginary_field"),
            insert_below="workflow",
        )
    self.assertEqual(len(with_mapping.list()), 1)

    # An aggregation that is not compatible with the field is rejected.
    with self.assertRaises(InvalidOperationMappingFieldError):
        with_mapping.aggs(
            Avg("average_of_string", field="classification_type"),
            insert_below="workflow",
        )
    self.assertEqual(len(with_mapping.list()), 1)

    # A field located behind a nested path gets a nested clause added
    # automatically.
    with_mapping = with_mapping.aggs(
        Avg("local_f1_score", field="local_metrics.performance.test.f1_score"),
        insert_below="workflow",
    )
    self.assertEqual(
        with_mapping.to_dict(),
        expected_query({"local_f1_score": f1_score_avg}),
    )
    self.assertIn("nested_below_workflow", with_mapping)
    nested_node = with_mapping.get("nested_below_workflow")
    self.assertEqual(nested_node.KEY, "nested")
    self.assertEqual(nested_node.path, "local_metrics")

    # Another aggregation requiring the same nested path reuses the existing
    # nested clause as its parent.
    with_mapping = with_mapping.aggs(
        Avg("local_precision", field="local_metrics.performance.test.precision"),
        insert_below="workflow",
    )
    self.assertEqual(
        with_mapping.to_dict(),
        expected_query(
            {
                "local_f1_score": f1_score_avg,
                "local_precision": precision_avg,
            }
        ),
    )
    self.assertEqual(len(with_mapping.list()), 4)

    # Below a nested parent, a field aggregation that must sit under the root
    # gets a reverse-nested clause added automatically.
    with_mapping = with_mapping.aggs(
        Terms("language_terms", field="language"),
        insert_below="nested_below_workflow",
    )
    self.assertEqual(len(with_mapping.list()), 6)
    self.assertEqual(
        with_mapping.to_dict(),
        expected_query(
            {
                "local_f1_score": f1_score_avg,
                "local_precision": precision_avg,
                "reverse_nested_below_nested_below_workflow": {
                    "aggs": {
                        "language_terms": {"terms": {"field": "language"}}
                    },
                    "reverse_nested": {},
                },
            }
        ),
    )
def test_groupby_args_syntax(self):
    """Check ``groupby`` called with a name, a type string and body keywords."""
    grouped = Aggs().groupby("some_name", "terms", field="some_field")
    expected = {"some_name": {"terms": {"field": "some_field"}}}
    self.assertEqual(grouped.to_dict(), expected)
def test_groupby_insert_above(self):
    """Check ``groupby(..., insert_above=...)`` for several input shapes.

    Starting from ``A -> (B, C)``, new terms aggregations are inserted above
    ``B`` (single node, list of nodes, node with a child) and above root ``A``.
    """

    def terms_node(field, aggs=None):
        # Expected serialization of a terms aggregation named after its field.
        node = {"terms": {"field": field}}
        if aggs is not None:
            node["aggs"] = aggs
        return node

    a1 = Aggs(
        Terms("A", field="A", aggs=[Terms("B", field="B"), Terms("C", field="C")])
    )
    initial = {"A": terms_node("A", {"B": terms_node("B"), "C": terms_node("C")})}
    self.assertEqual(a1.to_dict(), initial)

    # Single node above B.
    d_above_b = {
        "A": terms_node(
            "A",
            {
                "C": terms_node("C"),
                "D": terms_node("D", {"B": terms_node("B")}),
            },
        )
    }
    self.assertEqual(
        a1.groupby(Terms("D", field="D"), insert_above="B").to_dict(),
        d_above_b,
    )

    # D then E above B: both the list form and the nested-child form are
    # expected to serialize to the same query.
    d_e_above_b = {
        "A": terms_node(
            "A",
            {
                "C": terms_node("C"),
                "D": terms_node(
                    "D", {"E": terms_node("E", {"B": terms_node("B")})}
                ),
            },
        )
    }
    from_list = a1.groupby(
        [Terms("D", field="D"), Terms("E", field="E")], insert_above="B"
    )
    self.assertEqual(from_list.to_dict(), d_e_above_b)

    from_nested = a1.groupby(
        Terms("D", field="D", aggs=Terms("E", field="E")), insert_above="B"
    )
    self.assertEqual(from_nested.to_dict(), d_e_above_b)

    # Above root.
    d_e_above_root = {
        "D": terms_node(
            "D",
            {
                "E": terms_node(
                    "E",
                    {
                        "A": terms_node(
                            "A",
                            {"B": terms_node("B"), "C": terms_node("C")},
                        )
                    },
                )
            },
        )
    }
    above_root = a1.groupby(
        Terms("D", field="D", aggs=Terms("E", field="E")), insert_above="A"
    )
    self.assertEqual(above_root.to_dict(), d_e_above_root)
def test_init_from_node_hierarchy(self):
    """Build ``Aggs`` from node objects and check the serialized query.

    First with the sample hierarchy, then with a hierarchy whose fields are
    expected to require a ``nested`` clause (``nested_autocorrect=True``).
    """
    sample_hierarchy = sample.get_node_hierarchy()
    agg = Aggs(sample_hierarchy, mapping=MAPPING)
    self.assertEqual(agg.to_dict(), sample.EXPECTED_AGG_QUERY)

    # With nested.
    min_f1_score = Min(
        name="min_f1_score",
        field="local_metrics.performance.test.f1_score",
    )
    field_class_terms = Terms(
        name="local_metrics.field_class.name",
        field="local_metrics.field_class.name",
        size=10,
        aggs=[min_f1_score],
    )
    node_hierarchy = DateHistogram(
        name="week",
        field="date",
        interval="1w",
        aggs=[field_class_terms],
    )
    agg = Aggs(node_hierarchy, mapping=MAPPING, nested_autocorrect=True)

    expected = {
        "week": {
            "aggs": {
                "nested_below_week": {
                    "aggs": {
                        "local_metrics.field_class.name": {
                            "aggs": {
                                "min_f1_score": {
                                    "min": {
                                        "field": "local_metrics.performance.test.f1_score"
                                    }
                                }
                            },
                            "terms": {
                                "field": "local_metrics.field_class.name",
                                "size": 10,
                            },
                        }
                    },
                    "nested": {"path": "local_metrics"},
                }
            },
            "date_histogram": {"field": "date", "interval": "1w"},
        }
    }
    # The serialization is asserted twice in a row, against the same expected
    # query both times.
    self.assertEqual(agg.to_dict(), expected)
    self.assertEqual(agg.to_dict(), expected)
def test_validate_aggs_parent_id(self):
    """Check ``_validate_aggs_parent_id`` on branching, linear and empty trees.

    Initial tree (per the original author's diagram):

        <Aggregation>
        classification_type
        └── global_metrics.field.name
            ├── avg_f1_micro
            └── avg_nb_classes
    """
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)

    # No parent id given on the initial tree: rejected as ambiguous.
    with self.assertRaises(ValueError) as e:
        my_agg._validate_aggs_parent_id(pid=None)
    self.assertEqual(
        e.exception.args,
        (
            "Declaration is ambiguous, you must declare the node id under which these "
            "aggregations should be placed.",
        ),
    )

    # "avg_f1_micro" is rejected as not being a bucket aggregation.
    with self.assertRaises(ValueError) as e:
        my_agg._validate_aggs_parent_id("avg_f1_micro")
    self.assertEqual(
        e.exception.args, ("Node id <avg_f1_micro> is not a bucket aggregation.",)
    )

    # An existing bucket aggregation id is returned unchanged.
    self.assertEqual(
        my_agg._validate_aggs_parent_id("global_metrics.field.name"),
        "global_metrics.field.name",
    )

    # Unknown node ids raise a dedicated error.
    with self.assertRaises(NotFoundNodeError) as e:
        my_agg._validate_aggs_parent_id("non-existing-node")
    self.assertEqual(
        e.exception.args, ("Node id <non-existing-node> doesn't exist in tree",)
    )

    # Linear agg: after dropping both metric nodes the tree is
    #
    #     <Aggregation>
    #     classification_type
    #     └── global_metrics.field.name
    #
    # and the deepest node is expected when no parent id is given.
    my_agg.drop_node("avg_f1_micro")
    my_agg.drop_node("avg_nb_classes")
    self.assertEqual(
        my_agg._validate_aggs_parent_id(None), "global_metrics.field.name"
    )

    # Empty agg: no parent to attach to.
    agg = Aggs()
    self.assertIsNone(agg._validate_aggs_parent_id(None))
def test_interpret_agg_string(self):
    """Check that a bare string passed to ``aggs`` becomes a terms aggregation.

    Covers: the default form, extra body keywords (``size``), insertion below
    an existing parent, and insertion requiring an automatic ``nested`` clause.
    """
    # Plain string, no parent.
    plain = Aggs().aggs("some_field", insert_below=None)
    self.assertEqual(
        plain.to_dict(), {"some_field": {"terms": {"field": "some_field"}}}
    )

    # With default size.
    sized = Aggs().aggs("some_field", insert_below=None, size=10)
    self.assertEqual(
        sized.to_dict(),
        {"some_field": {"terms": {"field": "some_field", "size": 10}}},
    )

    # With parent.
    rooted = Aggs({"root_agg_name": {"terms": {"field": "some_field", "size": 5}}})
    rooted = rooted.aggs("child_field", insert_below="root_agg_name")
    expected_with_parent = {
        "root_agg_name": {
            "aggs": {"child_field": {"terms": {"field": "child_field"}}},
            "terms": {"field": "some_field", "size": 5},
        }
    }
    self.assertEqual(rooted.to_dict(), expected_with_parent)

    # With required nested.
    nested = Aggs(
        {"term_workflow": {"terms": {"field": "workflow", "size": 5}}},
        mapping=MAPPING,
        nested_autocorrect=True,
    )
    nested = nested.aggs(
        "local_metrics.field_class.name", insert_below="term_workflow"
    )
    expected_with_nested = {
        "term_workflow": {
            "aggs": {
                "nested_below_term_workflow": {
                    "aggs": {
                        "local_metrics.field_class.name": {
                            "terms": {"field": "local_metrics.field_class.name"}
                        }
                    },
                    "nested": {"path": "local_metrics"},
                }
            },
            "terms": {"field": "workflow", "size": 5},
        }
    }
    self.assertEqual(nested.to_dict(), expected_with_nested)
def test_insert_tree_without_mapping(self):
    """Insert a tree carrying an explicit nested clause below ``week``.

    Neither ``Aggs`` instance is given a mapping; the pasted tree is expected
    to appear unchanged under the ``week`` node.
    """
    # With explicit nested.
    week_histogram = {
        "date_histogram": {
            "field": "date",
            "format": "yyyy-MM-dd",
            "interval": "1w",
        }
    }
    initial_agg_1 = Aggs({"week": week_histogram})
    initial_ids = {n.identifier for n in initial_agg_1.list()}
    self.assertEqual(initial_ids, {"week"})

    pasted_agg_1 = Aggs(
        {
            "nested_below_week": {
                "nested": {"path": "local_metrics"},
                "aggs": {
                    "local_metrics.field_class.name": {
                        "terms": {
                            "field": "local_metrics.field_class.name",
                            "size": 10,
                        }
                    }
                },
            }
        }
    )
    self.assertEqual(
        to_id_set(pasted_agg_1.list()),
        {"nested_below_week", "local_metrics.field_class.name"},
    )

    initial_agg_1.insert_tree(pasted_agg_1, "week")

    self.assertEqual(
        to_id_set(initial_agg_1.list()),
        {"week", "nested_below_week", "local_metrics.field_class.name"},
    )
    expected_query = {
        "week": {
            "date_histogram": {
                "field": "date",
                "format": "yyyy-MM-dd",
                "interval": "1w",
            },
            "aggs": {
                "nested_below_week": {
                    "nested": {"path": "local_metrics"},
                    "aggs": {
                        "local_metrics.field_class.name": {
                            "terms": {
                                "field": "local_metrics.field_class.name",
                                "size": 10,
                            }
                        }
                    },
                }
            },
        }
    }
    self.assertEqual(initial_agg_1.to_dict(), expected_query)
def test_paste_tree_with_mapping(self):
    """Insert a tree below ``week`` on mapping-aware aggregations.

    Two scenarios are expected to produce the same query: the pasted tree
    carries its own nested clause, or it does not and the receiving ``Aggs``
    (built with ``nested_autocorrect=True``) adds one.
    """

    def week_query():
        # Fresh query dict per Aggs instance so the two scenarios share no state.
        return {
            "week": {
                "date_histogram": {
                    "field": "date",
                    "format": "yyyy-MM-dd",
                    "interval": "1w",
                }
            }
        }

    expected_ids = {"week", "nested_below_week", "local_metrics.field_class.name"}
    expected_query = {
        "week": {
            "date_histogram": {
                "field": "date",
                "format": "yyyy-MM-dd",
                "interval": "1w",
            },
            "aggs": {
                "nested_below_week": {
                    "nested": {"path": "local_metrics"},
                    "aggs": {
                        "local_metrics.field_class.name": {
                            "terms": {
                                "field": "local_metrics.field_class.name",
                                "size": 10,
                            }
                        }
                    },
                }
            },
        }
    }

    # With explicit nested.
    initial_agg_1 = Aggs(week_query(), mapping=MAPPING)
    self.assertEqual(to_id_set(initial_agg_1.list()), {"week"})

    pasted_agg_1 = Aggs(
        {
            "nested_below_week": {
                "nested": {"path": "local_metrics"},
                "aggs": {
                    "local_metrics.field_class.name": {
                        "terms": {
                            "field": "local_metrics.field_class.name",
                            "size": 10,
                        }
                    }
                },
            }
        }
    )
    self.assertEqual(
        to_id_set(pasted_agg_1.list()),
        {"nested_below_week", "local_metrics.field_class.name"},
    )

    initial_agg_1.insert_tree(pasted_agg_1, "week")
    self.assertEqual(to_id_set(initial_agg_1.list()), expected_ids)
    self.assertEqual(initial_agg_1.to_dict(), expected_query)

    # Without explicit nested.
    initial_agg_2 = Aggs(week_query(), mapping=MAPPING, nested_autocorrect=True)
    self.assertEqual(to_id_set(initial_agg_2.list()), {"week"})

    pasted_agg_2 = Aggs(
        {
            "local_metrics.field_class.name": {
                "terms": {"field": "local_metrics.field_class.name", "size": 10}
            }
        }
    )
    self.assertEqual(
        to_id_set(pasted_agg_2.list()), {"local_metrics.field_class.name"}
    )

    initial_agg_2.insert_tree(pasted_agg_2, "week")
    self.assertEqual(to_id_set(initial_agg_2.list()), expected_ids)
    self.assertEqual(initial_agg_2.to_dict(), expected_query)