예제 #1
0
def html(in_file, out_file, skip_none, only_new):
    """Write an HTML report.

    Args:
        in_file: an open file object holding the pickled sites.  It is used
            as a context manager, so it is closed once the data is loaded.
        out_file: forwarded to `html_report` as the report destination.
        skip_none: when true, sites whose `current_courses` is None are
            left out of the report.
        only_new: forwarded unchanged to `html_report`.

    As a side effect, the sorted course ids are written, one per line, to
    "course-ids.txt" in the current working directory.
    """
    # NOTE(review): pickle.load can execute arbitrary code from a crafted
    # file -- confirm in_file only ever comes from a trusted source.
    with in_file:
        sites = pickle.load(in_file)

    if skip_none:
        sites = [s for s in sites if s.current_courses is not None]

    # Prep data for reporting.
    old, new = totals(sites)
    all_courses, all_orgs, all_course_ids = courses_and_orgs(sites)

    with open("course-ids.txt", "w") as ids_file:
        id_lines = [course_id + "\n" for course_id in sorted(all_course_ids)]
        ids_file.write("".join(id_lines))

    # Domains we already know about: the sites CSV plus the aliases file.
    known_domains = set()
    for known_site in read_sites_csv(SITES_CSV):
        known_domains.add(domain_from_url(known_site.url))
    with open(ALIASES_TXT) as aliases:
        for alias_line in aliases:
            known_domains.add(domain_from_url(alias_line.strip()))

    def reversed_url_pieces(site):
        return site.url.split(".")[::-1]

    def course_count(site):
        return site.current_courses or site.latest_courses

    # Two stable sorts: order by the reversed dot-separated pieces of the url
    # first, then by course count descending, so sites with equal counts keep
    # the url-based order.
    by_url = sorted(sites, key=reversed_url_pieces)
    ranked = sorted(by_url, key=course_count, reverse=True)

    html_report(
        out_file,
        ranked,
        old,
        new,
        all_courses,
        all_orgs,
        known_domains=known_domains,
        only_new=only_new,
    )
예제 #2
0
def non_sub_urls(urls):
    """Return urls that are not subdomains of other urls.

    A url is dropped when some other url in `urls` has a domain whose
    dot-separated labels form a strict trailing part of this url's domain
    labels.  The surviving urls are returned in their original order.
    """
    split_domains = [domain_from_url(url).split(".") for url in urls]

    def is_parent_of(parent, child):
        # True when `parent` differs from `child` and equals the trailing
        # labels of `child`.
        if parent == child:
            return False
        tail = child[len(child) - len(parent):]
        return tail == parent

    top_level_domains = []
    for labels in split_domains:
        has_parent = False
        for other in split_domains:
            if is_parent_of(other, labels):
                has_parent = True
                break
        if not has_parent:
            top_level_domains.append(".".join(labels))

    kept = []
    for url in urls:
        if domain_from_url(url) in top_level_domains:
            kept.append(url)
    return kept
예제 #3
0
def get_known_domains():
    """Return the set of domains that are already known.

    The set holds the domain of every site read from SITES_CSV, plus the
    domain of every (whitespace-stripped) line in the ALIASES_TXT file.
    """
    domains = set()
    for site in read_sites_csv(SITES_CSV):
        domains.add(domain_from_url(site.url))

    with open(ALIASES_TXT) as alias_file:
        for raw_line in alias_file:
            domains.add(domain_from_url(raw_line.strip()))

    return domains
예제 #4
0
파일: sites.py 프로젝트: edx/openedx-census
 def best_url(self):
     """Return one representative url chosen from `self.sites`.

     Urls on non-chaff domains are preferred; if every url is chaff, all of
     them are considered.  The candidates are then narrowed with
     `non_sub_urls` and the first remaining url is returned.
     """
     all_urls = [site.url for site in self.sites]

     preferred = []
     for url in all_urls:
         if is_chaff_domain(domain_from_url(url)):
             continue
         preferred.append(url)

     # Fall back to every url when nothing survives the chaff filter.
     candidates = preferred if preferred else all_urls
     shortlisted = non_sub_urls(candidates)
     return shortlisted[0]
예제 #5
0
def write_site(site, writer, known_domains):
    """Write one site's section of the HTML report to `writer`.

    The section header shows the site url, the previous course count, the
    new count when it differs, and the site's tags.  Inside the section,
    every entry of `site.tried` is listed with either the number of courses
    it counted or the error it produced.

    Args:
        site: the site to report on.  Reads `url`, `latest_courses`,
            `current_courses`, `is_gone_now`, `is_gone`, `ssl_err`,
            `custom_parser_err`, `version`, `tags` and `tried`.
        writer: output object providing `start_section`, `end_section`
            and `write`.
        known_domains: forwarded to `is_known` to decide whether the site
            gets the "New" tag.
    """
    old, new = site.latest_courses, site.current_courses
    tags = Tags()

    new_text = ""
    if new is None:
        tags.add("None")
    else:
        if new != old:
            new_text = f"<b> &rarr; {new}</b>"
        # "Drastic": the count moved by more than 10 courses AND the old/new
        # ratio is outside [0.5, 1.5].  The ratio test used to be written as
        # `0.5 >= ratio >= 1.5`, which no number satisfies, so every change
        # of more than 10 courses was tagged regardless of the ratio.
        if old != 0 and new != 0 and abs(new - old) > 10:
            if not (0.5 <= old / new <= 1.5):
                tags.add("Drastic")
    if site.is_gone_now:
        tags.add("Gone")
    elif site.is_gone:
        tags.add("Back")
    if is_chaff_domain(domain_from_url(site.url)):
        tags.add("Chaff")
    elif not is_known(site, known_domains):
        tags.add("New")
    if site.ssl_err:
        tags.add("SSL")
    if site.custom_parser_err:
        tags.add("Custom parser error", "bad")
    if site.version:
        tags.add(site.version, "version")
    # Times are not right now that we limit requests, not sites.
    #if site.time > 5:
    #    tags.add(f"{site.time:.1f}s", "slow")
    for tag in site.tags:
        tags.add(tag)

    writer.start_section(
        f"<a class='url' href='{site.url}'>{site.url}</a>: {old}{new_text} {tags.html()}"
    )
    for attempt in site.tried:
        strategy = attempt.strategy
        tb = attempt.error
        if tb is None:
            writer.write(
                f"<p>{strategy}: counted {attempt.courses} courses</p>")
            continue

        lines = tb.splitlines()
        if len(lines) > 1:
            # Multi-line error: show the (truncated) last line as the
            # heading of a nested section holding the full text.
            summary = lines[-1][:100]
            writer.start_section(
                f"<span class='strategy'>{strategy}:</span> {escape(summary)}"
            )
            writer.write("""<pre class="stdout">""")
            writer.write(escape(tb))
            writer.write("""</pre>""")
            writer.end_section()
        else:
            # Single-line error.  An empty error string has no lines at all,
            # so guard the index, and escape the text just like the
            # multi-line branch does.
            message = lines[0] if lines else ""
            writer.write(f"<p>{strategy}: {escape(message)}")
    writer.end_section()
예제 #6
0
def test_domain_from_url(domain, url):
    """Check that `domain_from_url(domain)` equals `url`.

    NOTE(review): the parameter names look swapped relative to their roles
    (`domain` is the input handed to `domain_from_url`, `url` is the expected
    result); they are kept because the values are presumably supplied by
    name from outside this function -- confirm before renaming.
    """
    actual = domain_from_url(domain)
    assert actual == url
예제 #7
0
 def all_chaff(self):
     """Return True when every site in `self.sites` is on a chaff domain.

     An empty `self.sites` yields True.
     """
     for site in self.sites:
         if not is_chaff_domain(domain_from_url(site.url)):
             return False
     return True