Beispiel #1
0
class TestGraphSetWithValidDataMerging(TestCase):
    def setUp(self):
        resource_a1 = Resource(resource_id="123",
                               type_name="test:a",
                               links=[SimpleLink(pred="has-foo", obj="goo")])
        resource_a2 = Resource(resource_id="123",
                               type_name="test:a",
                               links=[SimpleLink(pred="has-goo", obj="foo")])
        resource_b1 = Resource(
            resource_id="abc",
            type_name="test:b",
            links=[ResourceLinkLink(pred="has-a", obj="123")])
        resource_b2 = Resource(resource_id="def",
                               type_name="test:b",
                               links=[SimpleLink(pred="name", obj="sue")])
        resources = [resource_a1, resource_a2, resource_b1, resource_b2]
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=resources,
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    def test_rdf_a_type(self):
        graph = self.graph_set.to_rdf()

        a_results = graph.query(
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o"
        )
        expected_a_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:has-goo", "foo"),
            ("test-name:id", "123"),
        ]
        a_result_tuples = []
        for a_result in a_results:
            self.assertEqual(2, len(a_result))
            a_result_tuples.append((str(a_result[0]), str(a_result[1])))
        self.assertEqual(expected_a_result_tuples, a_result_tuples)

    def test_validate(self):
        self.graph_set.validate()
Beispiel #2
0
 def test_orphaned_ref(self):
     resource_a1 = Resource(resource_id="123",
                            type_name="test:a",
                            links=[SimpleLink(pred="has-foo", obj="goo")])
     resource_b1 = Resource(
         resource_id="abc",
         type_name="test:b",
         links=[ResourceLinkLink(pred="has-a", obj="456")])
     resources = [resource_a1, resource_b1]
     graph_set = GraphSet(
         name="test-name",
         version="1",
         start_time=1234,
         end_time=4567,
         resources=resources,
         errors=["test err 1", "test err 2"],
         stats=MultilevelCounter(),
     )
     with self.assertRaises(GraphSetOrphanedReferencesException):
         graph_set.validate()
Beispiel #3
0
class TestGraphSetWithValidDataNoMerging(TestCase):
    def setUp(self):
        resource_a1 = Resource(resource_id="123",
                               type_name="test:a",
                               links=[SimpleLink(pred="has-foo", obj="goo")])
        resource_a2 = Resource(resource_id="456", type_name="test:a")
        resource_b1 = Resource(
            resource_id="abc",
            type_name="test:b",
            links=[ResourceLinkLink(pred="has-a", obj="123")])
        resource_b2 = Resource(resource_id="def",
                               type_name="test:b",
                               links=[SimpleLink(pred="name", obj="sue")])
        resources = [resource_a1, resource_a2, resource_b1, resource_b2]
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=resources,
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    def test_rdf_a_type(self):
        graph = self.graph_set.to_rdf()

        a_results = graph.query(
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o"
        )
        expected_a_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:id", "123"),
            ("test-name:id", "456"),
        ]
        a_result_tuples = []
        for a_result in a_results:
            self.assertEqual(2, len(a_result))
            a_result_tuples.append((str(a_result[0]), str(a_result[1])))
        self.assertEqual(expected_a_result_tuples, a_result_tuples)

    def test_rdf_b_type(self):
        graph = self.graph_set.to_rdf()
        graph.serialize("/tmp/test.rdf")
        linked_a_node_results = graph.query(
            "select ?s where {?s a <test-name:test:a>; <test-name:id> '123' }")
        self.assertEqual(len(linked_a_node_results), 1)
        for linked_a_node_result in linked_a_node_results:
            linked_a_node = str(linked_a_node_result[0])
        b_results = graph.query(
            "select ?p ?o where {?s a <test-name:test:b> ; ?p ?o} order by ?p ?o"
        )
        expected_b_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:b"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:b"),
            ("test-name:has-a", str(linked_a_node)),
            ("test-name:id", "abc"),
            ("test-name:id", "def"),
            ("test-name:name", "sue"),
        ]
        b_result_tuples = []
        for b_result in b_results:
            self.assertEqual(2, len(b_result))
            b_result_tuples.append((str(b_result[0]), str(b_result[1])))
        self.assertEqual(expected_b_result_tuples, b_result_tuples)

    def test_rdf_error_graphing(self):
        graph = self.graph_set.to_rdf()

        err_results = graph.query(
            "select ?o where { ?s <test-name:error> ?o } order by ?o")
        err_strs = []
        expected_err_strs = ["test err 1", "test err 2"]
        for err_result in err_results:
            self.assertEqual(1, len(err_result))
            err_strs.append(str(err_result[0]))
        self.assertEqual(err_strs, expected_err_strs)

    def test_to_dict(self):
        expected_dict = {
            "name": "test-name",
            "version": "1",
            "start_time": 1234,
            "end_time": 4567,
            "resources": {
                "123": {
                    "type": "test:a",
                    "links": [{
                        "pred": "has-foo",
                        "obj": "goo",
                        "type": "simple"
                    }],
                },
                "456": {
                    "type": "test:a"
                },
                "abc": {
                    "type":
                    "test:b",
                    "links": [{
                        "pred": "has-a",
                        "obj": "123",
                        "type": "resource_link"
                    }],
                },
                "def": {
                    "type": "test:b",
                    "links": [{
                        "pred": "name",
                        "obj": "sue",
                        "type": "simple"
                    }],
                },
            },
            "errors": ["test err 1", "test err 2"],
            "stats": {
                "count": 0
            },
        }
        self.assertDictEqual(expected_dict, self.graph_set.to_dict())

    def test_from_dict(self):
        input_dict = {
            "name": "test-name",
            "version": "1",
            "start_time": 1234,
            "end_time": 4567,
            "resources": {
                "123": {
                    "type": "test:a",
                    "links": [{
                        "pred": "has-foo",
                        "obj": "goo",
                        "type": "simple"
                    }],
                },
                "456": {
                    "type": "test:a"
                },
                "abc": {
                    "type":
                    "test:b",
                    "links": [{
                        "pred": "has-a",
                        "obj": "123",
                        "type": "resource_link"
                    }],
                },
                "def": {
                    "type": "test:b",
                    "links": [{
                        "pred": "name",
                        "obj": "sue",
                        "type": "simple"
                    }],
                },
            },
            "errors": ["test err 1", "test err 2"],
            "stats": {
                "count": 0
            },
        }
        graph_set = GraphSet.from_dict(input_dict)
        self.assertEqual(graph_set.to_dict(), input_dict)

    def test_validate(self):
        self.graph_set.validate()
Beispiel #4
0
    def scan(self) -> Dict[str, Any]:
        """Scan an account and return a dict containing keys:

            * account_id: str
            * output_artifact: str
            * api_call_stats: Dict[str, Any]
            * errors: List[str]

        If errors is non-empty the results are incomplete for this account.
        output_artifact is a pointer to the actual scan data - either on the local fs or in s3.

        To scan an account we create a set of GraphSpecs, one for each region.  Any ACCOUNT
        level granularity resources are only scanned in a single region (e.g. IAM Users)

        Returns:
            Dict of scan result, see above for details.
        """
        logger = Logger()
        with logger.bind(account_id=self.account_id):
            logger.info(event=AWSLogEvents.ScanAWSAccountStart)
            output_artifact = None
            stats = MultilevelCounter()
            errors: List[str] = []
            now = int(time.time())
            try:
                account_graph_set = GraphSet(
                    name=self.graph_name,
                    version=self.graph_version,
                    start_time=now,
                    end_time=now,
                    resources=[],
                    errors=[],
                    stats=stats,
                )
                # sanity check
                session = self.get_session()
                sts_client = session.client("sts")
                sts_account_id = sts_client.get_caller_identity()["Account"]
                if sts_account_id != self.account_id:
                    raise ValueError(
                        f"BUG: sts detected account_id {sts_account_id} != {self.account_id}"
                    )
                if self.regions:
                    scan_regions = tuple(self.regions)
                else:
                    scan_regions = get_all_enabled_regions(session=session)
                # build graph specs.
                # build a dict of regions -> services -> List[AWSResourceSpec]
                regions_services_resource_spec_classes: DefaultDict[
                    str,
                    DefaultDict[str,
                                List[Type[AWSResourceSpec]]]] = defaultdict(
                                    lambda: defaultdict(list))
                resource_spec_class: Type[AWSResourceSpec]
                for resource_spec_class in self.resource_spec_classes:
                    client_name = resource_spec_class.get_client_name()
                    resource_class_scan_granularity = resource_spec_class.scan_granularity
                    if resource_class_scan_granularity == ScanGranularity.ACCOUNT:
                        regions_services_resource_spec_classes[scan_regions[
                            0]][client_name].append(resource_spec_class)
                    elif resource_class_scan_granularity == ScanGranularity.REGION:
                        for region in scan_regions:
                            regions_services_resource_spec_classes[region][
                                client_name].append(resource_spec_class)
                    else:
                        raise NotImplementedError(
                            f"ScanGranularity {resource_class_scan_granularity} not implemented"
                        )
                with ThreadPoolExecutor(
                        max_workers=self.max_svc_threads) as executor:
                    futures = []
                    for (
                            region,
                            services_resource_spec_classes,
                    ) in regions_services_resource_spec_classes.items():
                        for (
                                service,
                                resource_spec_classes,
                        ) in services_resource_spec_classes.items():
                            region_session = self.get_session(region=region)
                            region_creds = region_session.get_credentials()
                            scan_future = schedule_scan_services(
                                executor=executor,
                                graph_name=self.graph_name,
                                graph_version=self.graph_version,
                                account_id=self.account_id,
                                region=region,
                                service=service,
                                access_key=region_creds.access_key,
                                secret_key=region_creds.secret_key,
                                token=region_creds.token,
                                resource_spec_classes=tuple(
                                    resource_spec_classes),
                            )
                            futures.append(scan_future)
                    for future in as_completed(futures):
                        graph_set_dict = future.result()
                        graph_set = GraphSet.from_dict(graph_set_dict)
                        errors += graph_set.errors
                        account_graph_set.merge(graph_set)
                    account_graph_set.validate()
            except Exception as ex:
                error_str = str(ex)
                trace_back = traceback.format_exc()
                logger.error(event=AWSLogEvents.ScanAWSAccountError,
                             error=error_str,
                             trace_back=trace_back)
                errors.append(" : ".join((error_str, trace_back)))
                unscanned_account_resource = UnscannedAccountResourceSpec.create_resource(
                    account_id=self.account_id, errors=errors)
                account_graph_set = GraphSet(
                    name=self.graph_name,
                    version=self.graph_version,
                    start_time=now,
                    end_time=now,
                    resources=[unscanned_account_resource],
                    errors=errors,
                    stats=stats,
                )
                account_graph_set.validate()
        output_artifact = self.artifact_writer.write_artifact(
            name=self.account_id, data=account_graph_set.to_dict())
        logger.info(event=AWSLogEvents.ScanAWSAccountEnd)
        api_call_stats = account_graph_set.stats.to_dict()
        return {
            "account_id": self.account_id,
            "output_artifact": output_artifact,
            "errors": errors,
            "api_call_stats": api_call_stats,
        }
Beispiel #5
0
 def scan(self) -> List[Dict[str, Any]]:
     logger = Logger()
     scan_result_dicts = []
     now = int(time.time())
     prescan_account_ids_errors: DefaultDict[str,
                                             List[str]] = defaultdict(list)
     futures = []
     with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
         shuffled_account_ids = random.sample(
             self.account_scan_plan.account_ids,
             k=len(self.account_scan_plan.account_ids))
         for account_id in shuffled_account_ids:
             with logger.bind(account_id=account_id):
                 logger.info(event=AWSLogEvents.ScanAWSAccountStart)
                 try:
                     session = self.account_scan_plan.accessor.get_session(
                         account_id=account_id)
                     # sanity check
                     sts_client = session.client("sts")
                     sts_account_id = sts_client.get_caller_identity(
                     )["Account"]
                     if sts_account_id != account_id:
                         raise ValueError(
                             f"BUG: sts detected account_id {sts_account_id} != {account_id}"
                         )
                     if self.account_scan_plan.regions:
                         scan_regions = tuple(
                             self.account_scan_plan.regions)
                     else:
                         scan_regions = get_all_enabled_regions(
                             session=session)
                     account_gran_scan_region = random.choice(
                         self.preferred_account_scan_regions)
                     # build a dict of regions -> services -> List[AWSResourceSpec]
                     regions_services_resource_spec_classes: DefaultDict[
                         str, DefaultDict[
                             str,
                             List[Type[AWSResourceSpec]]]] = defaultdict(
                                 lambda: defaultdict(list))
                     resource_spec_class: Type[AWSResourceSpec]
                     for resource_spec_class in self.resource_spec_classes:
                         client_name = resource_spec_class.get_client_name()
                         if resource_spec_class.scan_granularity == ScanGranularity.ACCOUNT:
                             if resource_spec_class.region_whitelist:
                                 account_resource_scan_region = resource_spec_class.region_whitelist[
                                     0]
                             else:
                                 account_resource_scan_region = account_gran_scan_region
                             regions_services_resource_spec_classes[
                                 account_resource_scan_region][
                                     client_name].append(
                                         resource_spec_class)
                         elif resource_spec_class.scan_granularity == ScanGranularity.REGION:
                             if resource_spec_class.region_whitelist:
                                 resource_scan_regions = tuple(
                                     region for region in scan_regions
                                     if region in
                                     resource_spec_class.region_whitelist)
                                 if not resource_scan_regions:
                                     resource_scan_regions = resource_spec_class.region_whitelist
                             else:
                                 resource_scan_regions = scan_regions
                             for region in resource_scan_regions:
                                 regions_services_resource_spec_classes[
                                     region][client_name].append(
                                         resource_spec_class)
                         else:
                             raise NotImplementedError(
                                 f"ScanGranularity {resource_spec_class.scan_granularity} unimplemented"
                             )
                     # Build and submit ScanUnits
                     shuffed_regions_services_resource_spec_classes = random.sample(
                         regions_services_resource_spec_classes.items(),
                         len(regions_services_resource_spec_classes),
                     )
                     for (
                             region,
                             services_resource_spec_classes,
                     ) in shuffed_regions_services_resource_spec_classes:
                         region_session = self.account_scan_plan.accessor.get_session(
                             account_id=account_id, region_name=region)
                         region_creds = region_session.get_credentials()
                         shuffled_services_resource_spec_classes = random.sample(
                             services_resource_spec_classes.items(),
                             len(services_resource_spec_classes),
                         )
                         for (
                                 service,
                                 svc_resource_spec_classes,
                         ) in shuffled_services_resource_spec_classes:
                             future = schedule_scan(
                                 executor=executor,
                                 graph_name=self.graph_name,
                                 graph_version=self.graph_version,
                                 account_id=account_id,
                                 region_name=region,
                                 service=service,
                                 access_key=region_creds.access_key,
                                 secret_key=region_creds.secret_key,
                                 token=region_creds.token,
                                 resource_spec_classes=tuple(
                                     svc_resource_spec_classes),
                             )
                             futures.append(future)
                 except Exception as ex:
                     error_str = str(ex)
                     trace_back = traceback.format_exc()
                     logger.error(
                         event=AWSLogEvents.ScanAWSAccountError,
                         error=error_str,
                         trace_back=trace_back,
                     )
                     prescan_account_ids_errors[account_id].append(
                         f"{error_str}\n{trace_back}")
     account_ids_graph_set_dicts: Dict[str,
                                       List[Dict[str,
                                                 Any]]] = defaultdict(list)
     for future in as_completed(futures):
         account_id, graph_set_dict = future.result()
         account_ids_graph_set_dicts[account_id].append(graph_set_dict)
     # first make sure no account id appears both in account_ids_graph_set_dicts
     # and prescan_account_ids_errors - this should never happen
     doubled_accounts = set(
         account_ids_graph_set_dicts.keys()).intersection(
             set(prescan_account_ids_errors.keys()))
     if doubled_accounts:
         raise Exception((
             f"BUG: Account(s) {doubled_accounts} in both "
             "account_ids_graph_set_dicts and prescan_account_ids_errors."))
     # graph prescan error accounts
     for account_id, errors in prescan_account_ids_errors.items():
         with logger.bind(account_id=account_id):
             unscanned_account_resource = UnscannedAccountResourceSpec.create_resource(
                 account_id=account_id, errors=errors)
             account_graph_set = GraphSet(
                 name=self.graph_name,
                 version=self.graph_version,
                 start_time=now,
                 end_time=now,
                 resources=[unscanned_account_resource],
                 errors=errors,
                 stats=MultilevelCounter(),
             )
             account_graph_set.validate()
             output_artifact = self.artifact_writer.write_json(
                 name=account_id, data=account_graph_set.to_dict())
             logger.info(event=AWSLogEvents.ScanAWSAccountEnd)
             api_call_stats = account_graph_set.stats.to_dict()
             scan_result_dicts.append({
                 "account_id": account_id,
                 "output_artifact": output_artifact,
                 "errors": errors,
                 "api_call_stats": api_call_stats,
             })
     # graph rest
     for account_id, graph_set_dicts in account_ids_graph_set_dicts.items():
         with logger.bind(account_id=account_id):
             # if there are any errors whatsoever we generate an empty graph with
             # errors only
             errors = []
             for graph_set_dict in graph_set_dicts:
                 errors += graph_set_dict["errors"]
             if errors:
                 unscanned_account_resource = UnscannedAccountResourceSpec.create_resource(
                     account_id=account_id, errors=errors)
                 account_graph_set = GraphSet(
                     name=self.graph_name,
                     version=self.graph_version,
                     start_time=now,
                     end_time=now,
                     resources=[unscanned_account_resource],
                     errors=errors,
                     stats=MultilevelCounter(
                     ),  # ENHANCHMENT: could technically get partial stats.
                 )
                 account_graph_set.validate()
             else:
                 account_graph_set = GraphSet(
                     name=self.graph_name,
                     version=self.graph_version,
                     start_time=now,
                     end_time=now,
                     resources=[],
                     errors=[],
                     stats=MultilevelCounter(),
                 )
                 for graph_set_dict in graph_set_dicts:
                     graph_set = GraphSet.from_dict(graph_set_dict)
                     account_graph_set.merge(graph_set)
             output_artifact = self.artifact_writer.write_json(
                 name=account_id, data=account_graph_set.to_dict())
             logger.info(event=AWSLogEvents.ScanAWSAccountEnd)
             api_call_stats = account_graph_set.stats.to_dict()
             scan_result_dicts.append({
                 "account_id": account_id,
                 "output_artifact": output_artifact,
                 "errors": errors,
                 "api_call_stats": api_call_stats,
             })
     return scan_result_dicts