def test_passing_overall_iam_action_override(self):
    """Check that the overrides file wins for iam:CreateAccessKey.

    The AWS docs list the action as "Write", while the overrides file
    classifies it as "Permissions management"; the override must be returned.
    """
    iam_overrides = get_action_access_level_overrides_from_yml("iam")
    overridden_level = determine_access_level_override(
        "iam", "CreateAccessKey", "Write", iam_overrides)
    expected_level = "Permissions management"
    self.assertEqual(overridden_level, expected_level)
Example #2
0
def create_database(destination_directory, access_level_overrides_file):
    """
    Create the JSON data source that holds the IAM data.

    Every regular file in BUNDLED_HTML_DIRECTORY_PATH whose name starts with
    ``list_`` is parsed. For each page the actions, resource types, and
    condition keys tables are collected into a dictionary keyed by service
    prefix, which is finally written to ``iam-definition.json`` inside
    ``destination_directory``.

    :param destination_directory: Directory that receives ``iam-definition.json``; a ``docs`` subdirectory is also created inside it if it does not exist
    :param access_level_overrides_file: The path to the file that we use for overriding access levels that are incorrect in the AWS documentation
    :return: None
    :raises Exception: If a row of a resource types or condition keys table does not have exactly three cells
    """

    # Create the docs directory if it doesn't exist
    Path(os.path.join(destination_directory, "docs")).mkdir(parents=True,
                                                            exist_ok=True)

    # This holds the entire IAM definition, keyed by service prefix
    schema = {}

    # os.listdir() never yields the same name twice, so no duplicate check is
    # needed; sorting keeps the processing order deterministic.
    file_list = sorted(
        filename for filename in os.listdir(BUNDLED_HTML_DIRECTORY_PATH)
        if os.path.isfile(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename)))

    for filename in file_list:
        if not filename.startswith("list_"):
            continue

        # The document is fully parsed inside the `with`, so the file handle
        # is released before the tables are walked.
        with open(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename),
                  "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        main_content = soup.find(id="main-content")
        if main_content is None:
            continue

        # Get service name
        title_element = main_content.find("h1", class_="topictitle")
        title = re.sub(r".*Actions, resources, and condition Keys for *",
                       "",
                       str(title_element.text),
                       flags=re.IGNORECASE)
        title = title.replace("</h1>", "")
        service_name = chomp(title)

        # The prefix is taken from the first sibling of the title that
        # mentions "prefix"; its value sits inside a <code class="code"> tag.
        service_prefix = ""
        for c in title_element.parent.children:
            if "prefix" in str(c):
                service_prefix = str(c)
                service_prefix = service_prefix.split(
                    '<code class="code">')[1]
                service_prefix = chomp(service_prefix.split("</code>")[0])
                break

        if service_prefix not in schema:
            # The URL to that service's Actions, Resources, and Condition Keys page
            service_authorization_url_prefix = "https://docs.aws.amazon.com/service-authorization/latest/reference"
            service_authorization_url = f"{service_authorization_url_prefix}/{filename}"
            schema[service_prefix] = {
                "service_name": service_name,
                "prefix": service_prefix,
                "service_authorization_url": service_authorization_url,
                "privileges": {},
                "resources": {},
                "conditions": {},
            }

        access_level_overrides_cfg = get_action_access_level_overrides_from_yml(
            service_prefix, access_level_overrides_file)

        tables = main_content.find_all("div", class_="table-contents")

        for table in tables:
            # There can be 3 tables, the actions table, an ARN table, and a condition key table
            # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
            if not header_matches("actions", table) or not header_matches(
                    "description", table):
                continue

            rows = table.find_all("tr")
            row_number = 0
            while row_number < len(rows):
                row = rows[row_number]

                cells = row.find_all("td")
                if not cells:
                    # Skip the header row, which has th, not td cells
                    row_number += 1
                    continue

                if len(cells) != 6:
                    # Sometimes the privilege contains Scenarios, and I don't know how to handle this.
                    # The rest of this table is abandoned in that case.
                    break

                # See if this cell spans multiple rows
                rowspan = 1
                if "rowspan" in cells[0].attrs:
                    rowspan = int(cells[0].attrs["rowspan"])

                priv = ""
                # Reset per row so a cell without any <a> tag neither raises
                # NameError nor inherits the previous row's link.
                api_documentation_link = None
                # Get the privilege
                for link in cells[0].find_all("a"):
                    if "href" not in link.attrs:
                        # Skip the <a id='...'> tags
                        api_documentation_link = None
                        continue
                    api_documentation_link = link.attrs.get('href')
                    logger.debug(api_documentation_link)
                    priv = chomp(link.text)
                if priv == "":
                    priv = chomp(cells[0].text)
                action_name = priv
                description = chomp(cells[1].text)
                access_level = chomp(cells[2].text)
                # Access Level #####
                # access_level_overrides_cfg will only be true if the service in question is present
                # in the overrides YML file
                if access_level_overrides_cfg:
                    override_result = determine_access_level_override(
                        service_prefix,
                        action_name,
                        access_level,
                        access_level_overrides_cfg,
                    )
                    if override_result:
                        access_level = override_result
                        logger.debug(
                            "Override: Setting access level for %s:%s to %s",
                            service_prefix,
                            action_name,
                            access_level,
                        )

                resource_types = {}
                # On the first row the resource columns start after the
                # action/description/access-level cells.
                resource_cell = 3

                while rowspan > 0:
                    if len(cells) in (3, 6):
                        # ec2:RunInstances contains a few "scenarios" which start in the
                        # description field, len(cells) is 5.
                        # I'm ignoring these as I don't know how to handle them.
                        # These include things like "EC2-Classic-InstanceStore" and
                        # "EC2-VPC-InstanceStore-Subnet"

                        resource_type = chomp(cells[resource_cell].text)

                        condition_keys_element = cells[resource_cell + 1]
                        condition_keys = []
                        if condition_keys_element.text != "":
                            condition_keys = [
                                chomp(key_element.text) for key_element in
                                condition_keys_element.find_all("p")
                            ]

                        dependent_actions_element = cells[resource_cell + 2]
                        dependent_actions = []
                        if dependent_actions_element.text != "":
                            dependent_actions = [
                                chomp(action_element.text)
                                for action_element in
                                dependent_actions_element.find_all("p")
                            ]

                        # A "*" in the resource type marks it as required
                        required = "*" in resource_type
                        if required:
                            resource_type = resource_type.strip("*")

                        resource_types[resource_type] = {
                            "resource_type": resource_type,
                            "required": required,
                            "condition_keys": condition_keys,
                            "dependent_actions": dependent_actions,
                        }
                    rowspan -= 1
                    if rowspan > 0:
                        # Continuation rows of a spanned action only carry the
                        # resource columns, so they start at cell 0.
                        row_number += 1
                        resource_cell = 0
                        row = rows[row_number]
                        cells = row.find_all("td")

                if "[permission only]" in priv:
                    priv = priv.split(" ")[0]

                privilege_schema = {
                    "privilege": priv,
                    "description": description,
                    "access_level": access_level,
                    "resource_types": resource_types,
                    "api_documentation_link": api_documentation_link
                }

                schema[service_prefix]["privileges"][priv] = privilege_schema
                row_number += 1

        # Get resource table
        for table in tables:
            if not header_matches("resource types",
                                  table) or not header_matches("arn", table):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if not cells:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of resource cells {} in {}".
                        format(len(cells), filename))

                resource = chomp(cells[0].text)

                arn = no_white_space(cells[1].text)
                conditions = [
                    chomp(condition.text)
                    for condition in cells[2].find_all("p")
                ]

                schema[service_prefix]["resources"][resource] = {
                    "resource": resource,
                    "arn": arn,
                    "condition_keys": conditions
                }

        # Get condition keys table
        for table in tables:
            if not (header_matches("<th> condition keys </th>", table)
                    and header_matches("<th> type </th>", table)):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if not cells:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of condition cells {} in {}".
                        format(len(cells), filename))

                condition = no_white_space(cells[0].text)
                description = chomp(cells[1].text)
                value_type = chomp(cells[2].text)

                schema[service_prefix]["conditions"][condition] = {
                    "condition": condition,
                    "description": description,
                    "type": value_type,
                }

    iam_definition_file = os.path.join(destination_directory,
                                       "iam-definition.json")
    with open(iam_definition_file, "w") as file:
        json.dump(schema, file, indent=4)
    # The %s placeholder is required: without it the logging module fails to
    # format the record and the message is never emitted.
    logger.info("Wrote IAM definition file to path: %s", iam_definition_file)
Example #3
0
def create_database(destination_directory, access_level_overrides_file):
    """
    Create the JSON data source that holds the IAM data.

    Every regular file in BUNDLED_HTML_DIRECTORY_PATH whose name starts with
    ``list_`` is parsed into a per-service dictionary with the keys
    ``service_name``, ``prefix``, ``privileges``, ``resources`` and
    ``conditions``. The resulting list is sorted by prefix and written to
    ``iam-definition.json`` inside ``destination_directory``.

    :param destination_directory: Directory that receives ``iam-definition.json``; a ``data/docs`` subdirectory is also created inside it if it does not exist
    :param access_level_overrides_file: The path to the file that we use for overriding access levels that are incorrect in the AWS documentation
    :return: None
    :raises Exception: If an actions table row does not have exactly six cells, or a resource types / condition keys table row does not have exactly three cells
    """

    # Create the docs directory if it doesn't exist
    Path(os.path.join(destination_directory, "data",
                      "docs")).mkdir(parents=True, exist_ok=True)

    # One entry per service; sorted by prefix right before it is written out
    schema = []

    for filename in os.listdir(BUNDLED_HTML_DIRECTORY_PATH):
        if not filename.startswith("list_"):
            continue
        if not os.path.isfile(
                os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename)):
            continue

        # The document is fully parsed inside the `with`, so the file handle
        # is released before the tables are walked.
        with open(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename),
                  "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        main_content = soup.find(id="main-content")
        if main_content is None:
            continue

        # Get service name
        title_element = main_content.find("h1", class_="topictitle")
        title = re.sub(r".*Actions, Resources, and Condition Keys for *", "",
                       str(title_element.text))
        title = title.replace("</h1>", "")
        service_name = chomp(title)

        # The prefix is taken from the first sibling of the title that
        # mentions "prefix"; its value sits inside a <code class="code"> tag.
        prefix = ""
        for c in title_element.parent.children:
            if "prefix" in str(c):
                prefix = str(c)
                prefix = prefix.split('<code class="code">')[1]
                prefix = chomp(prefix.split("</code>")[0])
                break
        service_schema = {
            "service_name": service_name,
            "prefix": prefix,
            "privileges": [],
            "resources": [],
            "conditions": [],
        }

        access_level_overrides_cfg = get_action_access_level_overrides_from_yml(
            prefix, access_level_overrides_file)

        tables = main_content.find_all("div", class_="table-contents")

        for table in tables:
            # There can be 3 tables, the actions table, an ARN table, and a condition key table
            # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
            header_cells = [chomp(str(x)) for x in table.find_all("th")]
            if "<th> Actions </th>" not in header_cells:
                continue

            rows = table.find_all("tr")
            row_number = 0
            while row_number < len(rows):
                row = rows[row_number]

                cells = row.find_all("td")
                if not cells:
                    # Skip the header row, which has th, not td cells
                    row_number += 1
                    continue

                if len(cells) != 6:
                    # Sometimes the privilege might span multiple rows.
                    # Example: amazonroute53-DisassociateVPCFromHostedZone
                    # We should be handling this, but if we are not, then bail
                    raise Exception("Unexpected format in {}: {}".format(
                        prefix, row))

                # See if this cell spans multiple rows
                rowspan = 1
                if "rowspan" in cells[0].attrs:
                    rowspan = int(cells[0].attrs["rowspan"])

                priv = ""
                # Get the privilege
                for link in cells[0].find_all("a"):
                    if "href" not in link.attrs:
                        # Skip the <a id='...'> tags
                        continue
                    priv = chomp(link.text)
                if priv == "":
                    priv = chomp(cells[0].text)
                action_name = priv
                description = chomp(cells[1].text)
                access_level = chomp(cells[2].text)
                # Access Level #####
                # access_level_overrides_cfg will only be true if the service in question is present
                # in the overrides YML file
                if access_level_overrides_cfg:
                    override_result = determine_access_level_override(
                        prefix,
                        action_name,
                        access_level,
                        access_level_overrides_cfg,
                    )
                    if override_result:
                        access_level = override_result
                        logger.debug(
                            "Override: Setting access level for %s:%s to %s",
                            prefix,
                            action_name,
                            access_level,
                        )

                resource_types = []
                # On the first row the resource columns start after the
                # action/description/access-level cells.
                resource_cell = 3

                while rowspan > 0:
                    if len(cells) in (3, 6):
                        # ec2:RunInstances contains a few "scenarios" which start in the
                        # description field, len(cells) is 5.
                        # I'm ignoring these as I don't know how to handle them.
                        # These include things like "EC2-Classic-InstanceStore" and
                        # "EC2-VPC-InstanceStore-Subnet"

                        resource_type = chomp(cells[resource_cell].text)

                        condition_keys_element = cells[resource_cell + 1]
                        condition_keys = []
                        if condition_keys_element.text != "":
                            condition_keys = [
                                chomp(key_element.text) for key_element in
                                condition_keys_element.find_all("p")
                            ]

                        dependent_actions_element = cells[resource_cell + 2]
                        dependent_actions = []
                        if dependent_actions_element.text != "":
                            dependent_actions = [
                                chomp(action_element.text)
                                for action_element in
                                dependent_actions_element.find_all("p")
                            ]

                        resource_types.append({
                            "resource_type": resource_type,
                            "condition_keys": condition_keys,
                            "dependent_actions": dependent_actions,
                        })
                    rowspan -= 1
                    if rowspan > 0:
                        # Continuation rows of a spanned action only carry the
                        # resource columns, so they start at cell 0.
                        row_number += 1
                        resource_cell = 0
                        row = rows[row_number]
                        cells = row.find_all("td")

                if "[permission only]" in priv:
                    priv = priv.split(" ")[0]

                privilege_schema = {
                    "privilege": priv,
                    "description": description,
                    "access_level": access_level,
                    "resource_types": resource_types,
                }

                service_schema["privileges"].append(privilege_schema)
                row_number += 1

        # Get resource table
        for table in tables:
            header_cells = [chomp(str(x)) for x in table.find_all("th")]
            if "<th> Resource Types </th>" not in header_cells:
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if not cells:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of resource cells {} in {}".
                        format(len(cells), filename))

                resource = chomp(cells[0].text)

                arn = no_white_space(cells[1].text)
                conditions = [
                    chomp(condition.text)
                    for condition in cells[2].find_all("p")
                ]

                service_schema["resources"].append({
                    "resource": resource,
                    "arn": arn,
                    "condition_keys": conditions
                })

        # Get condition keys table
        for table in tables:
            # Build the header list once instead of once per membership test
            header_cells = [chomp(str(x)) for x in table.find_all("th")]
            if ("<th> Condition Keys </th>" not in header_cells
                    or "<th> Type </th>" not in header_cells):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if not cells:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of condition cells {} in {}".
                        format(len(cells), filename))

                condition = no_white_space(cells[0].text)
                description = chomp(cells[1].text)
                value_type = chomp(cells[2].text)

                service_schema["conditions"].append({
                    "condition": condition,
                    "description": description,
                    "type": value_type,
                })
        schema.append(service_schema)

    schema.sort(key=lambda x: x["prefix"])
    iam_definition_file = os.path.join(destination_directory,
                                       "iam-definition.json")
    with open(iam_definition_file, "w") as file:
        json.dump(schema, file, indent=4)
    # The %s placeholder is required: without it the logging module fails to
    # format the record and the message is never emitted.
    logger.info("Wrote IAM definition file to path: %s", iam_definition_file)
Example #4
0
def build_action_table(db_session, service, access_level_overrides_file):
    """
    Builds the action table in the SQLite database.

    Each table returned by ``get_html`` for the service is inspected; only
    tables that have both an 'Actions' and an 'Access Level' column are
    ingested. Every ingested row is stored as an ``ActionTable`` record and
    committed immediately.

    :param db_session: Database session object
    :param service: AWS Service to query. This can be called in a loop or for a single service.
    :param access_level_overrides_file: The path to the file that we use for overriding access levels that are incorrect in the AWS documentation
    """

    def _comma_separate(cell):
        # Blank cells stay None. Cells holding several entries are joined by
        # two spaces when ingested as-is, so those get comma separated.
        if cell is None:
            return None
        if '  ' in cell:
            return get_comma_separated_condition_keys(cell)
        return cell

    docs_directory = os.path.abspath(os.path.dirname(__file__)) + '/data/docs/'
    parsed_pages = get_html(docs_directory, service)
    overrides_cfg = get_action_access_level_overrides_from_yml(
        service, access_level_overrides_file)

    for frames in parsed_pages:
        for frame in frames:  # pylint: disable=invalid-name
            split_table = json.loads(frame.to_json(orient='split'))
            # Only the Actions table is handled here; every other table
            # (e.g. Resource Types / ARN) is skipped.
            if not ('Actions' in frame and 'Access Level' in frame):
                continue

            for row in split_table['data']:
                resource_cell = row[3]
                if resource_cell is None:
                    # A blank cell indicates the action needs a wildcard
                    resource_type_name = 'None'
                    resource_type_name_append_wildcard = 'False'
                    resource_arn_format = '*'
                elif '*' in resource_cell:
                    # Wildcard suffix (parameter* instead of parameter):
                    # flag it and store the name without its last character
                    # to make searching easier.
                    resource_type_name = resource_cell[:-1]
                    resource_type_name_append_wildcard = 'True'
                    arn_query = db_session.query(ArnTable.raw_arn).filter(
                        and_(ArnTable.service.ilike(service),
                             ArnTable.resource_type_name.like(
                                 resource_type_name)))
                    matching_arn = arn_query.first()
                    try:
                        resource_arn_format = matching_arn.raw_arn
                    # For EC2 RunInstances, ResourceTypes have some duplicates.
                    # The Resource Types (*required) column has duplicates
                    # and the Access Level has `nan`
                    except AttributeError:
                        continue
                else:
                    resource_type_name = resource_cell
                    resource_type_name_append_wildcard = 'False'
                    matching_arn = db_session.query(ArnTable.raw_arn).filter(
                        ArnTable.service.ilike(service),
                        ArnTable.resource_type_name.like(
                            resource_cell)).first()
                    try:
                        # No matching ARN row means `matching_arn` is None,
                        # which surfaces here as AttributeError.
                        if '*' in matching_arn.raw_arn:
                            resource_arn_format = matching_arn.raw_arn[:-1]
                        else:
                            resource_arn_format = matching_arn.raw_arn
                    except AttributeError:
                        continue

                # For lambda:InvokeFunction, the cell is
                # 'lambda:InvokeFunction [permission only]'. Keep only the
                # text before the first space.
                raw_action = row[0]
                if ' ' in raw_action:
                    action_name = raw_action.partition(' ')[0]
                else:
                    action_name = raw_action

                # Access Level #####
                # overrides_cfg will only be true if the service in question
                # is present in the overrides YML file
                access_level = row[2]
                if overrides_cfg:
                    override_result = determine_access_level_override(
                        service, str.lower(action_name), row[2],
                        overrides_cfg)
                    if override_result:
                        access_level = override_result
                        print(
                            f"Override: Setting access level for {service}:{action_name} to {access_level}")

                # Condition keys and dependent actions share the same
                # normalisation.
                condition_keys = _comma_separate(row[4])
                dependent_actions = _comma_separate(row[5])

                db_session.add(ActionTable(
                    service=service,
                    name=str.lower(action_name),
                    description=row[1],
                    access_level=access_level,
                    resource_type_name=resource_type_name,
                    resource_type_name_append_wildcard=resource_type_name_append_wildcard,
                    resource_arn_format=str(resource_arn_format),
                    condition_keys=condition_keys,
                    dependent_actions=dependent_actions
                ))
                db_session.commit()
    db_session.commit()