def _add_to_urls_to_urllist(account: Account, current_list: UrlList, urls: List[str]) -> Dict[str, Any]:
    counters: Dict[str, int] = {'added_to_list': 0, 'already_in_list': 0}

    for url in urls:
        # if already in list, don't need to save it again
        already_in_list = UrlList.objects.all().filter(
            account=account, id=current_list.id, urls__url__iexact=url).exists()
        if already_in_list:
            counters['already_in_list'] += 1
            continue

        # if url already in database, we only need to add it to the list:
        existing_url = Url.objects.all().filter(url=url).first()
        if existing_url:
            current_list.urls.add(existing_url)
            counters['added_to_list'] += 1
        else:
            new_url = Url.add(url)

            # always try to find a few dns endpoints...
            compose_discover_task(urls_filter={'pk': new_url.id}).apply_async()

            current_list.urls.add(new_url)
            counters['added_to_list'] += 1

    return counters

def add_urls_to_organizations(organizations: List[Organization], urls: List[str]) -> None:
    for organization in organizations:
        for url in urls:
            # Make the API easier to use: also parse extensive urls such as
            # https://www.apple.com:80/yolo/swag
            extract = tldextract.extract(url)

            if extract.subdomain:
                url = f"{extract.subdomain}.{extract.domain}.{extract.suffix}"
                new_url = Url.add(url)
                new_url.organization.add(organization)

            if extract.domain:
                url = f"{extract.domain}.{extract.suffix}"
                new_url = Url.add(url)
                new_url.organization.add(organization)

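# Illustrative sketch (not part of the codebase above): how tldextract splits an "extensive" url into
# the subdomain and apex domain that add_urls_to_organizations stores. The values shown are regular
# tldextract output for this example input.
import tldextract

example = tldextract.extract("https://www.apple.com:80/yolo/swag")
# example.subdomain == "www", example.domain == "apple", example.suffix == "com"
full_domain = f"{example.subdomain}.{example.domain}.{example.suffix}"  # "www.apple.com"
apex_domain = f"{example.domain}.{example.suffix}"                      # "apple.com"
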
def get_url(new_url_string: str):
    # first check if one exists, if not, create it.
    url = Url.objects.all().filter(url=new_url_string).first()
    if url:
        return url, False

    url = Url.add(new_url_string)
    return url, True

def accept(self, request, queryset):
    for urlsubmission in queryset:
        # Don't add the same submission over and over: this allows re-selecting already added
        # submissions without a problem. Once rejected, a submission can't be accepted via these
        # buttons: that needs to be a manual action.
        if urlsubmission.has_been_accepted or urlsubmission.has_been_rejected:
            continue

        # It's possible the url is already in the system (it could be dead, stacked, etc).
        # If so, tie that existing url to the submitted organization.
        url = Url.objects.all().filter(url=urlsubmission.url, is_dead=False).first()
        if not url:
            log.debug("adding new url: %s" % urlsubmission.url)
            url = Url(url=urlsubmission.url)
            url.save()

        # the organization is already inside the submission and should exist in most cases.
        url.organization.add(urlsubmission.for_organization)
        url.save()

        # add some tracking data to the submission
        urlsubmission.url_in_system = url
        urlsubmission.has_been_accepted = True
        urlsubmission.save()

    self.message_user(request, "Urls have been accepted and added to the system.")

def add_url(new_url_string: str):
    new_url = Url()
    new_url.url = new_url_string
    new_url.created_on = timezone.now()
    new_url.save()

    # always try to find a few dns endpoints...
    compose_discover_task(urls_filter={'pk': new_url.id}).apply_async()

    return new_url

def setup():
    # Make a test organization + url
    test_type = OrganizationType.objects.all().filter(name="test").first()
    if not test_type:
        test_type = OrganizationType()
        test_type.name = "test"
        test_type.save()

    # Make sure there is a test organization that can be scanned
    test_organization = Organization.objects.all().filter(name="test").first()
    if not test_organization:
        test_organization = Organization()
        test_organization.name = "test"
        test_organization.created_on = datetime.now(pytz.utc)
        test_organization.country = "NL"
        test_organization.internal_notes = "Created for testing purposes only."
        test_organization.type = test_type
        test_organization.save()

    test_url = Url.objects.all().filter(organization=test_organization).first()
    if not test_url:
        test_url = Url()
        test_url.url = "faalkaart.nl"
        test_url.save()
        test_url.organization.add(test_organization)
        test_url.save()

    # Make sure the test organization can be scanned; it doesn't have to be displayed on the map.
    configuration = Configuration.objects.all().filter(
        country="NL", organization_type=test_type).first()
    if not configuration:
        configuration = Configuration()
        configuration.organization_type = test_type
        configuration.country = "NL"
        configuration.is_reported = True
        configuration.is_scanned = True
        configuration.is_displayed = False
        configuration.is_the_default_option = False
        configuration.save()

    return test_type, test_organization, test_url, configuration

def clean_urls(urls: List[str]) -> Dict[str, List]:
    """
    Incorrect urls are urls that do not follow the uri scheme standard and don't have a recognizable
    suffix. They are returned for informational purposes and can contain utter garbage. The editor of
    the urls can then easily see whether the urls were entered correctly and correct some mistakes.

    :param urls:
    :return:
    """
    result: Dict[str, List] = {'incorrect': [], 'correct': []}

    for url in urls:
        # all urls in the system must be lowercase (where applicable to the characters used)
        url = url.lower()

        if not Url.is_valid_url(url):
            result['incorrect'].append(url)
        else:
            result['correct'].append(url)

    return result

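# Hedged, standalone sketch of clean_urls behaviour. The validator below is an assumption made for
# illustration only; it is NOT Url.is_valid_url, which lives on the Django Url model.
from typing import Dict, List

import tldextract


def _looks_like_url(url: str) -> bool:
    # assumption: treat a url as valid when tldextract recognises a domain and a public suffix
    extract = tldextract.extract(url)
    return bool(extract.domain and extract.suffix)


def clean_urls_sketch(urls: List[str]) -> Dict[str, List]:
    result: Dict[str, List] = {'incorrect': [], 'correct': []}
    for url in urls:
        url = url.lower()
        result['correct' if _looks_like_url(url) else 'incorrect'].append(url)
    return result


# clean_urls_sketch(["Example.COM", "utter garbage"])
# -> {'incorrect': ['utter garbage'], 'correct': ['example.com']}
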
def add_urls(organization_id, urls: str):
    # todo: how does it behave with urls that include a protocol?

    # urls is basically garbage input spread over multiple lines, with spaces, commas and all kinds
    # of unicode. Here we try to break up this garbage into small pieces of text: some are a url,
    # some are garbage...
    urls = urls.replace(",", " ")
    urls = urls.replace("\n", " ")
    urls = urls.split(" ")
    urls = [u.strip() for u in urls]

    not_valid = []
    valid = []
    for url in urls:
        if not Url.is_valid_url(url):
            not_valid.append(url)
        else:
            valid.append(url)

    if not Organization.objects.all().filter(id=organization_id).exists():
        return operation_response(error=True, message="Organization could not be found.")

    if not valid:
        return operation_response(error=True, message="No valid url found.")

    organization = Organization.objects.all().filter(id=organization_id).first()
    for url in valid:
        organization.add_url(url)

    if not_valid:
        return operation_response(
            success=True,
            message=f"{len(valid)} urls have been added.",
            data={"invalid_domains": not_valid})
    else:
        return operation_response(success=True, message=f"{len(valid)} urls have been added.")

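# Standalone sketch of the tokenizing step in add_urls above: commas and newlines become spaces,
# then the text is split and stripped. Empty tokens are kept, exactly as in add_urls; they simply
# fail validation later.
tokens = "example.com, example.org\nexample.net"
tokens = tokens.replace(",", " ").replace("\n", " ").split(" ")
tokens = [t.strip() for t in tokens]
# tokens == ['example.com', '', 'example.org', 'example.net']
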
def test_report_upgrade(db, monkeypatch) -> None:
    # Create a urllist with a lot of unscannable domains, only apple.com is scannable.
    # megaupload.com will never be scannable, and the rest can have an endpoint and might be in the
    # report already because of this (but without endpoints)
    urls = ['akamaihd.net', 'apple.com', 'bp.blogspot.com', 'clickbank.net', 'cocolog-nifty.com',
            'fda.gov', 'geocities.jp', 'ggpht.com', 'googleusercontent.com', 'megaupload.com',
            'nhk.or.jp', 'ssl-images-amazon.com', 'ytimg.com']

    # create the list, code from test domain management:
    account, created = Account.objects.all().get_or_create(name="test")

    urllist = UrlList()
    urllist.name = "upgrade"
    urllist.account = account
    urllist.save()

    scan = AccountInternetNLScan()
    scan.urllist = urllist
    scan.account = account
    scan.save()

    for url in urls:
        new_url = Url()
        new_url.url = url
        new_url.save()
        urllist.urls.add(new_url)
        urllist.save()

    # fake a report on these domains, without any upgrades, taken from the acc environment:
    fake_calculation = {
        "high": 19, "medium": 4, "low": 3, "ok": 15,
        "total_urls": 1, "high_urls": 1, "medium_urls": 0, "low_urls": 0, "ok_urls": 0,
        "explained_high": 0, "explained_medium": 0, "explained_low": 0,
        "explained_high_endpoints": 0, "explained_medium_endpoints": 0, "explained_low_endpoints": 0,
        "explained_high_urls": 0, "explained_medium_urls": 0, "explained_low_urls": 0,
        "explained_total_url_issues": 0, "explained_url_issues_high": 0,
        "explained_url_issues_medium": 0, "explained_url_issues_low": 0,
        "explained_total_endpoint_issues": 0, "explained_endpoint_issues_high": 0,
        "explained_endpoint_issues_medium": 0, "explained_endpoint_issues_low": 0,
        "total_endpoints": 1, "high_endpoints": 1, "medium_endpoints": 0, "low_endpoints": 0,
        "ok_endpoints": 0, "total_url_issues": 0, "total_endpoint_issues": 26,
        "url_issues_high": 0, "url_issues_medium": 0, "url_issues_low": 0,
        "endpoint_issues_high": 19, "endpoint_issues_medium": 4, "endpoint_issues_low": 3,
        "urls": [
            {
                "url": "apple.com",
                "ratings": [],
                "endpoints": [
                    {
                        "id": 4599,
                        "concat": "dns_a_aaaa/0 IPv0",
                        "ip": 0,
                        "ip_version": 0,
                        "port": 0,
                        "protocol": "dns_a_aaaa",
                        "v4": False,
                        "ratings": [
                            {
                                "type": "internet_nl_web_ipv6_ws_address",
                                "explanation": "Test internet_nl_web_ipv6_ws_address resulted in failed.",
                                "since": "2020-01-15T13:00:01.116013+00:00",
                                "last_scan": "2020-01-15T13:00:01.116689+00:00",
                                "high": 1, "medium": 0, "low": 0, "ok": 0,
                                "not_testable": False, "not_applicable": False, "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114575,
                                "scan_type": "internet_nl_web_ipv6_ws_address"
                            },
                            {
                                "type": "internet_nl_web_dnssec_valid",
                                "explanation": "Test internet_nl_web_dnssec_valid resulted in failed.",
                                "since": "2020-01-15T13:00:00.684906+00:00",
                                "last_scan": "2020-01-15T13:00:00.685193+00:00",
                                "high": 1, "medium": 0, "low": 0, "ok": 0,
                                "not_testable": False, "not_applicable": False, "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114556,
                                "scan_type": "internet_nl_web_dnssec_valid"
                            },
                        ],
                        "high": 19, "medium": 4, "low": 3, "ok": 15,
                        "explained_high": 0, "explained_medium": 0, "explained_low": 0
                    }
                ],
                "total_issues": 26, "high": 19, "medium": 4, "low": 3, "ok": 15,
                "total_endpoints": 1, "high_endpoints": 1, "medium_endpoints": 0, "low_endpoints": 0,
                "ok_endpoints": 0, "total_url_issues": 0, "url_issues_high": 0, "url_issues_medium": 0,
                "url_issues_low": 0, "url_ok": 0, "total_endpoint_issues": 26,
                "endpoint_issues_high": 19, "endpoint_issues_medium": 4, "endpoint_issues_low": 3,
                "explained_total_issues": 0, "explained_high": 0, "explained_medium": 0,
                "explained_low": 0, "explained_high_endpoints": 0, "explained_medium_endpoints": 0,
                "explained_low_endpoints": 0, "explained_total_url_issues": 0,
                "explained_url_issues_high": 0, "explained_url_issues_medium": 0,
                "explained_url_issues_low": 0, "explained_total_endpoint_issues": 0,
                "explained_endpoint_issues_high": 0, "explained_endpoint_issues_medium": 0,
                "explained_endpoint_issues_low": 0
            }
        ],
        "total_issues": 26,
        "name": "Unscannable Web + one scannable"
    }

    fake_report = UrlListReport()
    fake_report.calculation = fake_calculation
    fake_report.urllist = urllist
    fake_report.at_when = timezone.now()
    fake_report.save()

    # First check if we are removing the comply_or_explain keys, mainly to save data:
    remove_comply_or_explain(fake_calculation)
    assert "explained_endpoint_issues_high" not in fake_calculation['urls'][0]
    assert "comply_or_explain_explanation" not in fake_calculation['urls'][0]['endpoints'][0]["ratings"][0]

    # Now add ratings based on keys, which makes direct access possible:
    add_keyed_ratings(fake_calculation)
    assert "ratings_by_type" in fake_calculation['urls'][0]['endpoints'][0]
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation['urls'][0]['endpoints'][0]['ratings_by_type']

    # Add graph statistics, so the graphs can be instantly created based on report data
    add_statistics_over_ratings(fake_calculation)
    assert "statistics_per_issue_type" in fake_calculation
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation["statistics_per_issue_type"]
    # todo: we can add some tests here to see if the aggregation is correct

    # add some statistics over all these metrics
    add_percentages_to_statistics(fake_calculation)
    assert "pct_ok" in fake_calculation["statistics_per_issue_type"]["internet_nl_web_ipv6_ws_address"]

    # And make sure the report is complete: meaning that all urls requested are present, even though
    # they could not be scanned. So a top 100 stays a top 100.
    assert len(fake_calculation['urls']) == 1
    upgrade_report_with_unscannable_urls(fake_report.id, scan.id)
    fake_report = UrlListReport.objects.all().first()
    assert len(fake_report.calculation['urls']) == len(urls)

    # the first url should still be by apple:
    assert fake_report.calculation['urls'][0]['url'] == "apple.com"

def alter_url_in_urllist(account, data) -> Dict[str, Any]:
    # data = {'list_id': list.id, 'url_id': url.id, 'new_url_string': url.url}
    expected_keys = ['list_id', 'url_id', 'new_url_string']
    if check_keys(expected_keys, data):
        return operation_response(error=True, message="Missing keys in data.")

    # which url are we changing?
    old_url = Url.objects.all().filter(pk=data['url_id']).first()
    if not old_url:
        return operation_response(error=True, message="The old url does not exist.")

    if old_url.url == data['new_url_string']:
        # no changes
        return operation_response(success=True, message="Saved.")

    # is this really a list?
    urllist = UrlList.objects.all().filter(account=account, pk=data['list_id']).first()
    if not urllist:
        return operation_response(error=True, message="List does not exist.")

    # is the url valid?
    if not Url.is_valid_url(data['new_url_string']):
        return operation_response(error=True, message="New url does not have the correct format.")

    # fetch the url, or create it if it doesn't exist.
    new_url, created = get_url(data['new_url_string'])

    # Only remove the old url from the list, not from the database: calling delete() would delete
    # the Url record itself.
    urllist.urls.remove(old_url)
    # Save after removal, so adding the same url again does not cause a foreign key error.
    urllist.save()
    urllist.urls.add(new_url)
    urllist.save()

    # somewhat inefficient to do four queries, yet good enough
    old_url_has_mail_endpoint = Endpoint.objects.all().filter(
        url=old_url, is_dead=False, protocol='dns_soa').exists()
    old_url_has_web_endpoint = Endpoint.objects.all().filter(
        url=old_url, is_dead=False, protocol='dns_a_aaaa').exists()

    if not created:
        new_url_has_mail_endpoint = Endpoint.objects.all().filter(
            url=new_url, is_dead=False, protocol='dns_soa').exists()
        new_url_has_web_endpoint = Endpoint.objects.all().filter(
            url=new_url, is_dead=False, protocol='dns_a_aaaa').exists()
    else:
        new_url_has_mail_endpoint = 'unknown'
        new_url_has_web_endpoint = 'unknown'

    new_fragments = tldextract.extract(new_url.url)
    old_fragments = tldextract.extract(old_url.url)

    return operation_response(success=True, message="Saved.", data={
        'created': {
            'id': new_url.id,
            'url': new_url.url,
            'created_on': new_url.created_on,
            'has_mail_endpoint': new_url_has_mail_endpoint,
            'has_web_endpoint': new_url_has_web_endpoint,
            'subdomain': new_fragments.subdomain,
            'domain': new_fragments.domain,
            'suffix': new_fragments.suffix
        },
        'removed': {
            'id': old_url.id,
            'url': old_url.url,
            'created_on': old_url.created_on,
            'has_mail_endpoint': old_url_has_mail_endpoint,
            'has_web_endpoint': old_url_has_web_endpoint,
            'subdomain': old_fragments.subdomain,
            'domain': old_fragments.domain,
            'suffix': old_fragments.suffix
        },
    })

def test_domain_upload(db, requests_mock, current_path):
    # The first answer is cached indefinitely, so the first request has to be correct.
    requests_mock.get(
        "https://publicsuffix.org/list/public_suffix_list.dat",
        text=text(f"{current_path}/websecmap/api/tests/public_suffix_list.dat"),
    )

    u = Url()
    u.url = "arnhem.nl"
    u.save()

    # tldextract does not work correctly in tests. This is a workaround to make sure the computed
    # fields are set. This is nonsense, since tldextract is also used in domain checks, so it should
    # work correctly.
    # u.computed_suffix = "nl"
    # u.computed_domain = "arnhem"
    # u.computed_subdomain = ""
    # super(Url, u).save()

    # Make sure that the domain is fully present, and that things can be matched.
    # ExtractResult(subdomain='arnhem', domain='nl', suffix=''), suffix should be 'nl'
    new_url = Url.objects.all().get(url="arnhem.nl")
    assert new_url.computed_suffix == "nl"
    assert new_url.computed_domain == "arnhem"
    assert new_url.computed_subdomain == ""

    # mock all requests to arnhem.nl, act like they exist:
    requests_mock.get("", text="1")
    websecmap.scanners.scanner.http.resolves = lambda x: True

    # 0,arnhem.nl,www.ris.zeewolde.nl.,1 -> qname not matching 2ndlevel, results in qname domain, but no match in test
    # 334,zeewolde.nl,www.ris.arnhem.nl.,1 -> qname not matching 2ndlevel, results in qname domain, including match
    # Will be skipped as there is no new info: 335,arnhem.nl.,arnhem.nl.,1
    csv_data = """,2ndlevel,qname,distinct_asns
123,arnhem.nl.,*.arnhem.nl.,1
124,arnhem.nl.,01.arnhem.nl.,1
163,arnhem.nl.,01daf671c183434584727ff1c0c29af1.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2123,arnhem.nl.,www.arnhem.nl.,1
2124,arnhem.nl.,www.h.arnhem.nl.,1
2124,arnhem.nl.,www.ris.arnhem.nl.,1
2124,arnhem.nl.,a.b.c.d.e.f.h.arnhem.nl.,1
2124,arnhem.nl.,a.bb.ccc.dddd.eeeee.ffffff.h.arnhem.nl.,1
2124,arnhem.nl.,a.*.ccc.dddd.eeeee.ffffff.h.arnhem.nl.,1
325,arnhem.nl.,14809963d1b7.arnhem.nl.,1
333,arnhem.nl,www.myris.zeewolde.nl.,1
334,zeewolde.nl,www.myris.arnhem.nl.,1
335,arnhem.nl.,arnhem.nl.,1
"""

    assert SIDNUpload.objects.all().count() == 0

    user = User()
    user.first_name = ""
    user.save()

    sidn_domain_upload(user.pk, csv_data)

    assert SIDNUpload.objects.all().count() == 1

    # Create a domain upload for arnhem:
    sidn_handle_domain_upload(1)

    first_upload = SIDNUpload.objects.first()
    assert first_upload.state == "done"
    assert sorted(first_upload.newly_added_domains) == sorted([
        "01.arnhem.nl",
        "01daf671c183434584727ff1c0c29af1.arnhem.nl",
        "www.h.arnhem.nl",
        "www.arnhem.nl",
        "14809963d1b7.arnhem.nl",
        "www.ris.arnhem.nl",
        "a.b.c.d.e.f.h.arnhem.nl",
        "a.bb.ccc.dddd.eeeee.ffffff.h.arnhem.nl",
        "www.myris.arnhem.nl",
    ])
    assert first_upload.amount_of_newly_added_domains == 9
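
# Hedged, standalone sketch (an assumption, not the project's SIDN parser): reads csv data in the
# format used above and keeps only qnames that fall under a given second-level domain, skipping
# wildcards and the apex itself. For the csv in test_domain_upload this yields the same nine domains
# the asserts expect, though the real handler also checks which domains already exist in the database.
import csv
import io


def candidate_domains(csv_data: str, second_level: str = "arnhem.nl") -> set:
    found = set()
    for row in csv.DictReader(io.StringIO(csv_data)):
        qname = row["qname"].rstrip(".")
        if "*" in qname or qname == second_level:
            continue
        if qname.endswith(f".{second_level}"):
            found.add(qname)
    return found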