Example #1
import os

from bs4 import BeautifulSoup, Tag
from sqlalchemy.orm import Session

# RawScrapingData, get_aws_html, and clean_service_name are helpers defined
# elsewhere in this project.
def scrape_iso_table(db_session: Session, link: str, destination_folder: str, file_name: str, download: bool = True):
    html_file_path = os.path.join(destination_folder, file_name)

    # get_aws_html gets the HTML from AWS docs and stores it locally, then returns the path
    if download:
        if os.path.exists(html_file_path):
            os.remove(html_file_path)
        get_aws_html(link, html_file_path)

    raw_scraping_data = RawScrapingData()

    with open(html_file_path, "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
        table = soup.find("tbody")
        # table.contents also holds whitespace NavigableStrings; keep only <tr> tags
        rows = [row for row in table.contents if isinstance(row, Tag)]

        for row in rows:
            service_name = clean_service_name(str(row.contents[1].text))
            sdk = clean_service_name(str(row.contents[3].text))
            # Skip the header row and the namespaces footnote row
            if sdk == "Namespaces*" or service_name == "AWS Services":
                continue
            raw_scraping_data.add_entry_to_database(
                db_session=db_session,
                compliance_standard_name="ISO",
                sdk=sdk,
                service_name=service_name,  # already cleaned above
            )
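
A possible invocation of scrape_iso_table, sketched under assumptions: the connection string, link, and paths below are placeholders rather than the project's real values; create_engine and sessionmaker are the standard SQLAlchemy way to obtain a Session.

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///compliance.db")  # placeholder connection string
session = sessionmaker(bind=engine)()
scrape_iso_table(
    db_session=session,
    link="https://example.com/iso-compliance",  # placeholder, not the real docs URL
    destination_folder="/tmp",
    file_name="iso.html",
)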
Example #2
import os

from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

def scrape_hipaa_table(db_session: Session,
                       link: str,
                       destination_folder: str,
                       file_name: str,
                       download: bool = True):
    html_file_path = os.path.join(destination_folder, file_name)
    if download:
        # Re-downloading: drop any stale local copy first
        if os.path.exists(html_file_path):
            os.remove(html_file_path)
        get_aws_html(link, html_file_path)

    raw_scraping_data = RawScrapingData()

    # <li> entries on the page that match the prefixes below but are not services
    false_positives = [
        "AWS Cloud Security",
        "AWS Management Console",
        "AWS CloudEndure",
        "Amazon CloudWatch SDK Metrics",
        "AWS Managed Services",
        "AWS Solutions Portfolio",
        "AWS Partner Network",
        "AWS Careers",
        "AWS Support Overview",
    ]
    service_names = []
    with open(html_file_path, "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
        for tag in soup.find_all("li"):
            cleaned = clean_service_name(tag.text)
            # str.startswith accepts a tuple of prefixes
            if (cleaned.startswith(("Amazon", "AWS", "Elastic", "Alexa"))
                    and cleaned not in false_positives):
                service_names.append(cleaned)

    for service_name in service_names:
        raw_scraping_data.add_entry_to_database(
            db_session=db_session,
            compliance_standard_name="HIPAA",
            sdk="",  # The HIPAA page does not list SDKs; this field is filled in later
            service_name=service_name,  # already cleaned above
        )
Example #3
from bs4 import NavigableString, Tag

def get_service_name(some_cells):
    # The name sits in the second child of the first cell; it may be a bare
    # NavigableString or wrapped in a Tag such as <a>.
    service_name_cell = some_cells[0].contents[1]
    if isinstance(service_name_cell, Tag):
        service_name = service_name_cell.text
    else:
        # NavigableString (and anything else) stringifies cleanly
        service_name = str(service_name_cell)
    return clean_service_name(service_name)
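
A quick way to exercise the Tag branch of get_service_name, using a hand-written row; the markup is invented for illustration, and clean_service_name must already be in scope:

from bs4 import BeautifulSoup

html = "<tr><td> <a href='#'>Amazon S3</a></td><td>S3</td></tr>"
cells = BeautifulSoup(html, "html.parser").find_all("td")
print(get_service_name(cells))  # -> "Amazon S3"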
Example #4
def test_clean_service_name_tabs_and_newlines(self):
    # Make sure tabs and newlines are removed properly
    result = clean_service_name('\n\n\t\tAmazon API Gateway\t\n')
    self.assertEqual(result, "Amazon API Gateway")
    result = clean_service_name('Amazon API Gateway\n')
    self.assertEqual(result, "Amazon API Gateway")
Example #5
def test_clean_service_name_non_breaking_spaces(self):
    # \u00a0 is a non-breaking space and should be stripped like regular whitespace
    result = clean_service_name('AWS Amplify\u00a0')
    self.assertEqual(result, "AWS Amplify")
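
The clean_service_name helper itself does not appear in these examples. A minimal sketch that would satisfy both tests above, assuming the helper only needs to trim surrounding whitespace (str.strip removes all Unicode whitespace, \u00a0 included):

def clean_service_name(name: str) -> str:
    # strip() handles \t, \n, and non-breaking spaces alike
    return name.strip()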