def html(in_file, out_file, skip_none, only_new):
    """Write an HTML report."""
    with in_file:
        sites = pickle.load(in_file)
    if skip_none:
        sites = [site for site in sites if site.current_courses is not None]

    # Prep data for reporting.
    old, new = totals(sites)
    all_courses, all_orgs, all_course_ids = courses_and_orgs(sites)
    with open("course-ids.txt", "w") as f:
        f.write("".join(i + "\n" for i in sorted(all_course_ids)))

    known_domains = {
        domain_from_url(site.url) for site in read_sites_csv(SITES_CSV)
    }
    with open(ALIASES_TXT) as aliases:
        known_domains.update(domain_from_url(line.strip()) for line in aliases)

    # Sort by reversed dot-separated URL parts first, then (stably) by course
    # count, descending.
    sites = sorted(sites, key=lambda s: s.url.split(".")[::-1])
    sites = sorted(sites, key=lambda s: s.current_courses or s.latest_courses, reverse=True)
    html_report(
        out_file, sites, old, new, all_courses, all_orgs,
        known_domains=known_domains, only_new=only_new,
    )
def non_sub_urls(urls):
    """Return urls that are not subdomains of other urls."""
    domain_parts = [domain_from_url(u).split(".") for u in urls]

    def is_suffix(dp1, dp2):
        """Are the domain parts dp1 a proper suffix of dp2 (so dp2 is a subdomain of dp1)?"""
        return dp1 != dp2 and dp2[len(dp2) - len(dp1):] == dp1

    non_sub_doms = [
        ".".join(d) for d in domain_parts
        if not any(is_suffix(d2, d) for d2 in domain_parts)
    ]
    non_subs = [u for u in urls if domain_from_url(u) in non_sub_doms]
    return non_subs
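# A minimal usage sketch for non_sub_urls, not part of the original module. The
# URLs are invented, and it assumes domain_from_url() returns a URL's hostname,
# as its other uses in this file suggest.
def _example_non_sub_urls():
    urls = [
        "https://openedx.example.org",
        "https://courses.openedx.example.org",  # its domain is a subdomain of the one above
        "https://school.example.edu",
    ]
    # Under that assumption, the subdomain entry is dropped and the other two
    # URLs come back in their original order.
    return non_sub_urls(urls)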
def get_known_domains():
    """Return the set of domains read from SITES_CSV and ALIASES_TXT."""
    known_domains = {
        domain_from_url(site.url) for site in read_sites_csv(SITES_CSV)
    }
    with open(ALIASES_TXT) as aliases:
        known_domains.update(domain_from_url(line.strip()) for line in aliases)
    return known_domains
def best_url(self):
    """Choose the best URL to represent this group of sites."""
    site_urls = [site.url for site in self.sites]
    non_chaff = [
        url for url in site_urls if not is_chaff_domain(domain_from_url(url))
    ]
    # Prefer non-chaff URLs if there are any, drop URLs whose domains are
    # subdomains of other candidates, and take the first that remains.
    urls = non_chaff or site_urls
    urls = non_sub_urls(urls)
    return urls[0]
def write_site(site, writer, known_domains):
    old, new = site.latest_courses, site.current_courses
    tags = Tags()
    new_text = ""
    if new is None:
        tags.add("None")
    else:
        if new != old:
            new_text = f"<b> → {new}</b>"
            # Flag large swings: more than 10 courses changed, and the old/new
            # ratio falls outside the range 0.5 to 1.5.
            if old != 0 and new != 0 and abs(new - old) > 10 and not (0.5 <= old / new <= 1.5):
                tags.add("Drastic")
    if site.is_gone_now:
        tags.add("Gone")
    elif site.is_gone:
        tags.add("Back")
    if is_chaff_domain(domain_from_url(site.url)):
        tags.add("Chaff")
    elif not is_known(site, known_domains):
        tags.add("New")
    if site.ssl_err:
        tags.add("SSL")
    if site.custom_parser_err:
        tags.add("Custom parser error", "bad")
    if site.version:
        tags.add(site.version, "version")
    # Times are not right now that we limit requests, not sites.
    #if site.time > 5:
    #    tags.add(f"{site.time:.1f}s", "slow")
    for tag in site.tags:
        tags.add(tag)

    writer.start_section(
        f"<a class='url' href='{site.url}'>{site.url}</a>: {old}{new_text} {tags.html()}"
    )
    for attempt in site.tried:
        strategy = attempt.strategy
        tb = attempt.error
        if tb is not None:
            lines = tb.splitlines()
            if len(lines) > 1:
                line = lines[-1][:100]
                writer.start_section(
                    f"<span class='strategy'>{strategy}:</span> {escape(line)}"
                )
                writer.write("""<pre class="stdout">""")
                writer.write(escape(tb))
                writer.write("""</pre>""")
                writer.end_section()
            else:
                writer.write(f"<p>{strategy}: {lines[0]}</p>")
        else:
            writer.write(f"<p>{strategy}: counted {attempt.courses} courses</p>")
    writer.end_section()
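# A worked example of the "Drastic" tag condition above, with invented numbers:
# old=40, new=15 gives abs(new - old) = 25 > 10 and old/new ≈ 2.67, outside the
# 0.5 to 1.5 band, so the site is tagged Drastic. By contrast, old=40, new=45 is
# a change of only 5 courses, so no tag is added.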
def test_domain_from_url(domain, url):
    assert domain_from_url(url) == domain
def all_chaff(self):
    return all(is_chaff_domain(domain_from_url(site.url)) for site in self.sites)