def rate_urllist_on_moment(urllist: UrlList,
                           when: datetime = None,
                           prevent_duplicates: bool = True):
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (
        urllist,
        when,
    ))

    if UrlListReport.objects.all().filter(urllist=urllist,
                                          at_when=when).exists():
        log.debug(
            "UrllistReport already exists for %s on %s. Not overwriting." %
            (urllist, when))
        return

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    all_url_ratings = get_latest_urlratings_fast(urls, when)
    calculation = aggegrate_url_rating_scores(
        all_url_ratings,
        only_include_issues=urllist_report_content[urllist.scan_type])

    try:
        last = UrlListReport.objects.filter(
            urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation,
                        calculation,
                        ignore_order=True,
                        report_repetition=True):
            log.warning(
                "The report for %s on %s is the same as the report from %s. Not saving."
                % (urllist, when, last.at_when))
            return

    log.info(
        "The calculation for %s on %s has changed, so we're saving this rating."
        % (urllist, when))

    # remove urls and name from scores object, so it can be used as initialization parameters (saves lines)
    # this is by reference, meaning that the calculation will be affected if we don't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(
        calculation)
    report.calculation = calculation
    report.save()
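
# Usage sketch (hypothetical call site, not part of the original snippet): rebuild today's report
# for every list. Because of the DeepDiff comparison above, running this twice over an unchanged
# list only logs a warning the second time and saves nothing new.
for one_list in UrlList.objects.all():
    rate_urllist_on_moment(one_list)  # no "when" given, so the report covers the current moment
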
Example #2
def test_report_upgrade(db, monkeypatch) -> None:
    # Create a urllist with a lot of unscannable domains; only apple.com is scannable.
    # megaupload.com will never be scannable. The rest can have an endpoint and might therefore
    # already be in the report, but without endpoints.

    urls = ['akamaihd.net', 'apple.com', 'bp.blogspot.com', 'clickbank.net', 'cocolog-nifty.com', 'fda.gov',
            'geocities.jp', 'ggpht.com', 'googleusercontent.com', 'megaupload.com', 'nhk.or.jp',
            'ssl-images-amazon.com', 'ytimg.com']

    # create the list, code from test domain management:
    account, created = Account.objects.all().get_or_create(name="test")
    urllist = UrlList()
    urllist.name = "upgrade"
    urllist.account = account
    urllist.save()

    scan = AccountInternetNLScan()
    scan.urllist = urllist
    scan.account = account
    scan.save()

    for url in urls:
        new_url = Url()
        new_url.url = url
        new_url.save()
        urllist.urls.add(new_url)
        urllist.save()

    # fake a report on these domains, without any upgrades, taken from the acc environment:
    fake_calculation = {
        "high": 19,
        "medium": 4,
        "low": 3,
        "ok": 15,
        "total_urls": 1,
        "high_urls": 1,
        "medium_urls": 0,
        "low_urls": 0,
        "ok_urls": 0,
        "explained_high": 0,
        "explained_medium": 0,
        "explained_low": 0,
        "explained_high_endpoints": 0,
        "explained_medium_endpoints": 0,
        "explained_low_endpoints": 0,
        "explained_high_urls": 0,
        "explained_medium_urls": 0,
        "explained_low_urls": 0,
        "explained_total_url_issues": 0,
        "explained_url_issues_high": 0,
        "explained_url_issues_medium": 0,
        "explained_url_issues_low": 0,
        "explained_total_endpoint_issues": 0,
        "explained_endpoint_issues_high": 0,
        "explained_endpoint_issues_medium": 0,
        "explained_endpoint_issues_low": 0,
        "total_endpoints": 1,
        "high_endpoints": 1,
        "medium_endpoints": 0,
        "low_endpoints": 0,
        "ok_endpoints": 0,
        "total_url_issues": 0,
        "total_endpoint_issues": 26,
        "url_issues_high": 0,
        "url_issues_medium": 0,
        "url_issues_low": 0,
        "endpoint_issues_high": 19,
        "endpoint_issues_medium": 4,
        "endpoint_issues_low": 3,
        "urls": [
            {
                "url": "apple.com",
                "ratings": [],
                "endpoints": [
                    {
                        "id": 4599,
                        "concat": "dns_a_aaaa/0 IPv0",
                        "ip": 0,
                        "ip_version": 0,
                        "port": 0,
                        "protocol": "dns_a_aaaa",
                        "v4": False,
                        "ratings": [
                            {
                                "type": "internet_nl_web_ipv6_ws_address",
                                "explanation": "Test internet_nl_web_ipv6_ws_address resulted in failed.",
                                "since": "2020-01-15T13:00:01.116013+00:00",
                                "last_scan": "2020-01-15T13:00:01.116689+00:00",
                                "high": 1,
                                "medium": 0,
                                "low": 0,
                                "ok": 0,
                                "not_testable": False,
                                "not_applicable": False,
                                "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114575,
                                "scan_type": "internet_nl_web_ipv6_ws_address"
                            },
                            {
                                "type": "internet_nl_web_dnssec_valid",
                                "explanation": "Test internet_nl_web_dnssec_valid resulted in failed.",
                                "since": "2020-01-15T13:00:00.684906+00:00",
                                "last_scan": "2020-01-15T13:00:00.685193+00:00",
                                "high": 1,
                                "medium": 0,
                                "low": 0,
                                "ok": 0,
                                "not_testable": False,
                                "not_applicable": False,
                                "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114556,
                                "scan_type": "internet_nl_web_dnssec_valid"
                            },
                        ],
                        "high": 19,
                        "medium": 4,
                        "low": 3,
                        "ok": 15,
                        "explained_high": 0,
                        "explained_medium": 0,
                        "explained_low": 0
                    }
                ],
                "total_issues": 26,
                "high": 19,
                "medium": 4,
                "low": 3,
                "ok": 15,
                "total_endpoints": 1,
                "high_endpoints": 1,
                "medium_endpoints": 0,
                "low_endpoints": 0,
                "ok_endpoints": 0,
                "total_url_issues": 0,
                "url_issues_high": 0,
                "url_issues_medium": 0,
                "url_issues_low": 0,
                "url_ok": 0,
                "total_endpoint_issues": 26,
                "endpoint_issues_high": 19,
                "endpoint_issues_medium": 4,
                "endpoint_issues_low": 3,
                "explained_total_issues": 0,
                "explained_high": 0,
                "explained_medium": 0,
                "explained_low": 0,
                "explained_high_endpoints": 0,
                "explained_medium_endpoints": 0,
                "explained_low_endpoints": 0,
                "explained_total_url_issues": 0,
                "explained_url_issues_high": 0,
                "explained_url_issues_medium": 0,
                "explained_url_issues_low": 0,
                "explained_total_endpoint_issues": 0,
                "explained_endpoint_issues_high": 0,
                "explained_endpoint_issues_medium": 0,
                "explained_endpoint_issues_low": 0
            }
        ],
        "total_issues": 26,
        "name": "Unscannable Web + one scannable"
    }

    fake_report = UrlListReport()
    fake_report.calculation = fake_calculation
    fake_report.urllist = urllist
    fake_report.at_when = timezone.now()
    fake_report.save()

    # First check if we are removing the comply_or_explain keys, mainly to save data:
    remove_comply_or_explain(fake_calculation)
    assert "explained_endpoint_issues_high" not in fake_calculation['urls'][0]
    assert "comply_or_explain_explanation" not in fake_calculation['urls'][0]['endpoints'][0]["ratings"][0]

    # Now add ratings based on keys, which makes direct access possible:
    add_keyed_ratings(fake_calculation)
    assert "ratings_by_type" in fake_calculation['urls'][0]['endpoints'][0]
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation['urls'][0]['endpoints'][0]['ratings_by_type']

    # Add graph statistics, so the graphs can be instantly created based on report data
    add_statistics_over_ratings(fake_calculation)
    assert "statistics_per_issue_type" in fake_calculation
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation["statistics_per_issue_type"]
    # todo: we can add some tests here to see if the aggregation is correct
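    # A sketch for that todo, assuming the aggregated statistics store per-severity counters next
    # to pct_ok (the "high" key below is an assumption, not confirmed by this snippet). The fake
    # data contains exactly one failing internet_nl_web_ipv6_ws_address rating, so for example:
    # assert fake_calculation["statistics_per_issue_type"]["internet_nl_web_ipv6_ws_address"].get("high") == 1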

    # add some statistics over all these metrics
    add_percentages_to_statistics(fake_calculation)

    assert "pct_ok" in fake_calculation["statistics_per_issue_type"]["internet_nl_web_ipv6_ws_address"]

    # and make sure the report is complete: meaning that all urls requested are present, even though they
    # could not be scanned. So a top 100 stays a top 100.
    assert len(fake_calculation['urls']) == 1
    upgrade_report_with_unscannable_urls(fake_report.id, scan.id)
    fake_report = UrlListReport.objects.all().first()
    assert len(fake_report.calculation['urls']) == len(urls)

    # the first url should still be by apple:
    assert fake_report.calculation['urls'][0]['url'] == "apple.com"
def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True):
    """
    :param urllist: The UrlList to create a report for
    :param when: A moment in time for which data should be aggregated
    :param prevent_duplicates: If the last report had the same data, don't save a new report but return the last
    report instead.
    :return: UrlListReport
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (urllist, when, ))

    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug("UrllistReport already exists for %s on %s. Not overwriting." % (urllist, when))
        existing_report = UrlListReport.objects.all().filter(urllist=urllist, at_when=when).first()
        return existing_report

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    all_url_ratings = get_latest_urlratings_fast(urls, when)

    # Clean the url_ratings so they only contain the content we need: only the issues relevant to
    # this scan type and only the relevant endpoint types.
    for urlrating in all_url_ratings:
        calculation = remove_issues_from_calculation(urlrating.calculation, urllist_report_content[urllist.scan_type])

        # Some endpoint types use the same ratings, such as dns_soa and dns_mx... This means that
        # not all endpoints will be removed for internet.nl, so only keep the endpoint type needed
        # per scan type. Note: urllist stores web/mail, which mean web and mail_dashboard.
        endpoint_types_per_scan = {"web": "dns_a_aaaa", "mail": "dns_soa"}
        calculation = only_include_endpoint_protocols(calculation, [endpoint_types_per_scan[urllist.scan_type]])

        # This already overwrites the endpoint statistics, so use the calculation it returns.
        calculation, amount_of_issues = statistics_over_url_calculation(calculation)
        # overwrite the rest of the statistics.
        calculation = add_statistics_to_calculation(calculation, amount_of_issues)

        urlrating.calculation = calculation

    calculation = aggegrate_url_rating_scores(all_url_ratings)

    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.warning("The report for %s on %s is the same as the report from %s. Not saving." % (
                urllist, when, last.at_when))
            return last

    log.info("The calculation for %s on %s has changed, so we're saving this rating." % (urllist, when))

    # remove urls and name from scores object, so it can be used as initialization parameters (saves lines)
    # this is by reference, meaning that the calculation will be affected if we don't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
    return report
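
# Usage sketch (hypothetical caller, not part of the original snippet): this variant returns the
# saved report, or the unchanged previous one, so the result can be used directly.
# 'some_urllist' is a placeholder for an existing UrlList instance.
existing_or_new = rate_urllist_on_moment(some_urllist, when=datetime(2020, 1, 15, 13, 0, tzinfo=pytz.utc))
log.debug("Average internet.nl score: %s" % existing_or_new.average_internet_nl_score)
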
def rate_urllist_on_moment(urllist: UrlList,
                           when: datetime = None,
                           prevent_duplicates: bool = True,
                           scan_type: str = "web") -> int:
    """
    :param urllist: The UrlList to create a report for
    :param when: A moment in time for which data should be aggregated
    :param prevent_duplicates: If the last report had the same data, don't save a new report but return the id of
    the last report instead.
    :param scan_type: The scan type to report on: web, mail or mail_dashboard
    :return: UrlListReport id
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info(f"Creating report for urllist {urllist} on {when}")

    if UrlListReport.objects.all().filter(urllist=urllist,
                                          at_when=when).exists():
        log.debug(
            f"UrllistReport already exists for {urllist} on {when}. Not overwriting."
        )
        existing_report = UrlListReport.objects.all().filter(
            urllist=urllist, at_when=when).first()
        return int(existing_report.id)

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    log.debug(f'Found {len(urls)} urls to be relevant at this moment.')

    calculation = create_calculation_on_urls(urls, when, scan_type=scan_type)

    try:
        last = UrlListReport.objects.filter(
            urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation,
                        calculation,
                        ignore_order=True,
                        report_repetition=True):
            log.info(
                f"The report for {urllist} on {when} is the same as the report from {last.at_when}. Not saving."
            )
            return int(last.id)

    log.info(
        f"The calculation for {urllist} on {when} has changed, so we're saving this rating."
    )

    # remove urls and name from scores object, so it can be used as initialization parameters (saves lines)
    # this is by reference, meaning that the calculation will be affected if we don't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    external_scan_type = {
        "web": "web",
        "mail": "mail",
        "mail_dashboard": "mail"
    }
    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.report_type = external_scan_type[scan_type]
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(
        calculation)
    report.calculation = calculation
    report.save()
    return int(report.id)
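
# Usage sketch (hypothetical caller, not part of the original snippet): this variant returns only
# the primary key, so a caller fetches the report separately when the full data is needed.
# 'some_urllist' is a placeholder for an existing UrlList instance.
report_id = rate_urllist_on_moment(some_urllist, scan_type="mail")
saved_report = UrlListReport.objects.get(id=report_id)
log.debug(f"Report {report_id} contains {len(saved_report.calculation['urls'])} urls.")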