def build_privacy_policy(data):
    with open('_site/privacy-policy.html', 'w') as output:
        output.write(render_template(
            template=get_template(data, "privacy-policy.html"),
        ))
    print_progress(text="Generate Privacy Policy")
def generate_sitemap(blog_posts):
    data = DataSource(populate=False)
    # write sitemap to _site (to be used as index for static site search)
    with open("_site/sitemap.json", "w") as output:
        json.dump(site_to_json(data_source=data, blog_posts=blog_posts), output)
    print_progress(text='Generate sitemap index')
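# A minimal consumer sketch for the sitemap index written above. The exact
# entry shape is produced by `site_to_json` (defined elsewhere in the repo),
# so treating entries as opaque JSON objects here is an assumption for
# illustration only.
import json

def load_search_index(path="_site/sitemap.json"):
    with open(path) as f:
        return json.load(f)

# e.g. a naive lookup over the index, in the spirit of static site search:
# entries = load_search_index()
# matches = [e for e in entries if "google" in str(e).lower()]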
def build_imprint(data):
    with open('_site/imprint.html', 'w') as output:
        output.write(render_template(
            template=get_template(data, "imprint.html"),
        ))
    print_progress(text="Generate Imprint")
def build_blogpost_list(data, blog_posts):
    with open('_site/blog.html', 'w') as output:
        output.write(
            render_template(template=get_template(data, "blog.html"),
                            blog_posts=[p for p in blog_posts if p['publish']]))
    print_progress(text="Generate blog list")
def build_tracker_pages(data):
    template = get_template(data, name='tracker-page.html', path_to_root='..')
    for (tracker_id, tracker) in data.trackers.iter():
        tracker_page(template, tracker_id, tracker, data)
    print_progress(text="Generate tracker pages")
def build_explorer():
    data = DataSource(populate=False)
    build_packed_data(data)
    temp_folder = Path("temp")
    if not temp_folder.exists():
        temp_folder.mkdir()
    table_to_csv(data.trackers, "temp/trackers.csv")
    table_to_csv(data.sites, "temp/sites.csv")
    table_to_csv(data.companies, "temp/companies.csv")
    table_to_csv(data.sites_trackers, "temp/sites_trackers.csv")
    month = data.trackers.last_month
    shutil.make_archive(f"_site/data/wtm-data-{month}", "zip", "temp")
    shutil.rmtree(temp_folder.as_posix(), ignore_errors=True)
    with open("_site/explorer.html", "w") as output:
        output.write(render_template(
            template=get_template(data, name="explorer.html"),
            download_link=f"data/wtm-data-{month}.zip"
        ))
    print_progress(text="Generate Explorable Dataset")
def build_home(data):
    apps = data.apps
    sorted_trackers = sorted(apps.values(),
                             key=lambda a: a['overview']['reach'],
                             reverse=True)
    sorted_trackers_cat = sorted(apps.values(),
                                 key=lambda a: a.get('cat', '') or '')
    for tracker in sorted_trackers:
        if 'name' not in tracker:
            tracker['name'] = tracker['overview']['id']
    for tracker in sorted_trackers_cat:
        if 'name' not in tracker:
            tracker['name'] = tracker['overview']['id']

    # most tracked sites by category
    most_tracked_sites = tracked_by_category(data.sites, worst=True)
    # least tracked sites by category
    least_tracked_sites = tracked_by_category(data.sites, worst=False)

    top10 = company_reach(data.companies)
    header_graph = Markup(overview_bars(top10))
    with open('_site/index.html', 'w') as output:
        output.write(
            render_template(template=get_template(data, "index.html"),
                            ts=header_graph,
                            tracker_list=sorted_trackers[:20],
                            trackers_list_cat=sorted_trackers_cat[:20],
                            most_tracked_sites=most_tracked_sites,
                            least_tracked_sites=least_tracked_sites))
    print_progress(text="Generate home page")
def build_website_pages(data):
    template = get_template(data, "website-page.html", path_to_root='..')
    for (rank, site) in enumerate(
            data.sites.sort_by(metric='popularity', descending=True)):
        website_page(template, site, rank + 1, data)
    print_progress(text="Generate website pages")
def build_tracker_pages(data):
    apps = data.apps
    template = get_template(data, name='tracker-page.html', path_to_root='..')
    for (aid, app) in apps.items():
        tracker_page(template, aid, app, data)
    print_progress(text="Generate tracker pages")
def build_company_pages(data):
    companies = data.companies
    template = get_template(data, "company-page.html")
    for company_data in companies.values():
        company_page(template, company_data, data)
    print_progress(text="Generate company pages")
def build_trackers_list(data):
    with open('_site/trackers.html', 'w') as output:
        output.write(
            render_template(template=get_template(data, name="trackers.html"),
                            tracker_list=data.trackers.sort_by(metric="reach"),
                            trackers_list_company=data.trackers.sort_by(
                                metric="company_id", descending=False),
                            header_stats=data.trackers.summary_stats()))
    print_progress(text="Generate tracker list")
def build_api(data):
    # tracker overviews
    data_dir = Path('_site/data/trackers/global')
    if not data_dir.exists():
        data_dir.mkdir(parents=True)
    for tracker_id, _stats in data.trackers.iter():
        build_tracker_json(tracker_id, data)
    print_progress(text='Generate API data')
def build_website_pages(data):
    sites = data.sites
    template = get_template(data, "website-page.html", path_to_root='..')
    for rank, (site_id, site) in enumerate(
            sorted(sites.items(),
                   key=lambda s: s[1]['overview']['popularity'],
                   reverse=True)):
        website_page(template, site_id, rank + 1, data)
    print_progress(text="Generate website pages")
def build_company_reach_chart_page(data):
    top100 = company_reach(data.companies, n=100)
    chart = Markup(overview_bars(top100, highlight=10, custom_height=3000))
    template = get_template(data, name='reach-chart-page.html', path_to_root='..')
    with open('_site/companies/reach-chart.html', 'w') as output:
        output.write(render_template(
            path_to_root='..',
            template=template,
            chart=chart,
        ))
    print_progress(text="Generate company reach chart")
def build_website_list(data):
    header_numbers = data.sites.summary_stats()
    sorted_websites = data.sites.sort_by(metric='popularity', descending=True)
    sorted_websites_cat = data.sites.sort_by(metric='category', descending=True)
    with open('_site/websites.html', 'w') as output:
        output.write(render_template(
            template=get_template(data, "websites.html"),
            website_list=sorted_websites,
            website_list_cat=sorted_websites_cat,
            header_numbers=header_numbers
        ))
    print_progress(text="Generate website list")
def build_blogpost_pages(data, blog_posts):
    template = get_template(data,
                            "blog-page.html",
                            render_markdown=True,
                            path_to_root='..')
    for blog_post in blog_posts:
        with open(f'_site/blog/{blog_post.get("filename")}.html', 'w') as output:
            output.write(
                render_template(path_to_root='..',
                                template=template,
                                blog_post=blog_post))
    print_progress(text="Generate blog posts")
def build_packed_data(data):
    data_dir = Path("_site/data/packed/")
    if not data_dir.exists():
        data_dir.mkdir(parents=True)
    for data_source in ["trackers", "companies", "sites", "sites_trackers"]:
        with open(f"_site/data/packed/{data_source}.pack", "wb") as output:
            output.write(b"".join(
                pack_rows(
                    fields=FIELDS,
                    rows=getattr(data, data_source).get_snapshot().itertuples(),
                )))
    print_progress(text="Generate packed data")
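# `pack_rows` and `FIELDS` are defined elsewhere in the repo. As a rough
# illustration of the technique (serialising snapshot rows into a compact
# binary stream), here is a hypothetical packer using length-prefixed UTF-8
# strings and little-endian doubles; the actual wire format may differ.
import struct

def pack_rows_sketch(fields, rows):
    for row in rows:
        for field in fields:
            value = getattr(row, field)
            if isinstance(value, str):
                encoded = value.encode("utf-8")
                # 4-byte length prefix, then the raw bytes
                yield struct.pack("<I", len(encoded)) + encoded
            else:
                # numeric values as 8-byte IEEE 754 doubles
                yield struct.pack("<d", float(value))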
def build_blogpost_pages(data, blog_posts):
    for blog_post in blog_posts:
        # TODO: move template creation out of the loop once the footnotes
        # markdown extension no longer saves global state
        template = get_template(data,
                                "blog-page.html",
                                render_markdown=True,
                                path_to_root='..')
        with open(f'_site/blog/{blog_post.get("filename")}.html', 'w') as output:
            output.write(
                render_template(path_to_root='..',
                                template=template,
                                blog_post=blog_post))
    print_progress(text="Generate blog posts")
def batched_job(inp, batch_fn, batch_size, message):
    # Note: expects `executor` and `futures` from an enclosing scope
    # (see `feed_event` below, where this helper is defined as a closure).
    batches = []
    input_size = len(inp)
    for batch in [inp[i:i + batch_size]
                  for i in range(0, input_size, batch_size)]:
        submission = executor.submit(batch_fn, batch=batch)
        batches.append(submission)
        futures.append(submission)
    for i, f in enumerate(concurrent.futures.as_completed(batches)):
        print_progress(
            text=f"{message} {min((i + 1) * batch_size, input_size)}/{input_size}")
    return batches
def build_api(data):
    # tracker overviews
    data_dir = Path('_site/data/trackers/global')
    if not data_dir.exists():
        data_dir.mkdir(parents=True)
    for tracker_id, _ in data.trackers.iter():
        stats = data.trackers.get_tracker(tracker_id)
        stats['overview'] = dict(stats['overview'])
        # drop some columns
        for col in ['Index', 'companies', 'month', 'trackers', 'tracker',
                    'id', 'company_id', 'category', 'country']:
            del stats['overview'][col]
        stats['date_range'] = [date.strftime('%Y-%m')
                               for date in stats['date_range']]
        with open(f'_site/data/trackers/global/{tracker_id}.json', 'w') as output:
            json.dump(stats, output)
    print_progress(text='Generate API data')
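# Sketch of reading one of the per-tracker JSON overviews generated above;
# the tracker id 'google_analytics' is hypothetical, but the 'date_range'
# key is written by build_api itself.
import json

with open('_site/data/trackers/global/google_analytics.json') as f:
    tracker_stats = json.load(f)
print(tracker_stats['date_range'])  # e.g. ['2017-05', '2017-06', ...]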
def build_home(data):
    top10 = company_reach(data.companies)
    header_graph = Markup(overview_bars(top10))
    with open('_site/index.html', 'w') as output:
        output.write(
            render_template(
                template=get_template(data, "index.html"),
                ts=header_graph,
                tracker_list=data.trackers.sort_by(metric="reach")[:20],
                trackers_list_company=data.trackers.sort_by(
                    metric="company_id")[:20],
                most_tracked_sites=data.sites.sort_by(metric='trackers')[:20],
                least_tracked_sites=data.sites.sort_by(metric='trackers',
                                                       descending=False)[:20],
                websites=data.sites.summary_stats(),
                tracker_stats=data.trackers.summary_stats(),
                top10=top10))
    print_progress(text="Generate home page")
def build_explorer(data):
    build_packed_data(data)
    temp_folder = Path("temp")
    if not temp_folder.exists():
        temp_folder.mkdir()
    data.trackers.df.to_csv("temp/trackers.csv")
    data.sites.df.to_csv("temp/sites.csv")
    data.companies.df.to_csv("temp/companies.csv")
    data.sites_trackers.df.to_csv("temp/sites_trackers.csv")
    month = datetime.strftime(max(data.trackers.df.month), '%Y-%m')
    shutil.make_archive(f"_site/data/wtm-data-{month}", "zip", "temp")
    shutil.rmtree(temp_folder.as_posix(), ignore_errors=True)
    with open("_site/explorer.html", "w") as output:
        output.write(
            render_template(template=get_template(data, name="explorer.html"),
                            download_link=f"data/wtm-data-{month}.zip"))
    print_progress(text="Generate Explorable Dataset")
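# The archive built above contains the four CSVs at its root. A sketch of
# reading one back without unpacking to disk; the month in the filename is
# hypothetical, and pandas is assumed available (the `data.*.df` attributes
# above already imply it):
import zipfile
import pandas as pd

with zipfile.ZipFile("_site/data/wtm-data-2018-04.zip") as archive:
    with archive.open("trackers.csv") as f:
        trackers = pd.read_csv(f)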
def build_trackers_list(data):
    apps = data.apps
    sorted_trackers = sorted(apps.values(),
                             key=lambda a: a['overview']['reach'],
                             reverse=True)
    # sort by company id, falling back to the app name when no company is known
    sorted_trackers_cat = sorted(
        apps.values(),
        key=lambda a: data.get_app_name(a['overview']['id'])
        if ('company_id' not in a or a['company_id'] in [None, "None"])
        else a['company_id'])
    for tracker in sorted_trackers:
        if 'name' not in tracker:
            tracker['name'] = tracker['overview']['id']
    with open('_site/trackers.html', 'w') as output:
        output.write(
            render_template(template=get_template(data, name="trackers.html"),
                            tracker_list=sorted_trackers,
                            trackers_list_cat=sorted_trackers_cat,
                            header_stats=tracker_header_stats(data.apps)))
    print_progress(text="Generate tracker list")
def build_website_list(data):
    sites = data.sites
    tracker_requests, tracker_buckets, https = summary_stats(data.sites)

    # header stats
    tracker_values = []
    tracker_labels = []
    for (k, v) in tracker_buckets.items():
        tracker_values.append(v)
        tracker_labels.append(k)

    header_numbers = header_stats(data.sites)
    sorted_websites = sort_by_rank(data.sites)
    sorted_websites_cat = sort_by_cat(data.sites)

    # write to file
    with open('_site/websites.html', 'w') as output:
        output.write(
            render_template(template=get_template(data, "websites.html"),
                            website_list=sorted_websites,
                            website_list_cat=sorted_websites_cat,
                            header_numbers=header_numbers))
    print_progress(text="Generate website list")
def feed_event(self, event):
    futures = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        ###################################################################
        # This needs to be first, as other tasks will need to write in    #
        # the resulting folders.                                          #
        ###################################################################

        # Depends on folder: 'static/'
        if event & STATIC_FOLDER:
            create_site_structure(static_path=STATIC_PATH)
            print_progress(text='Create _site')

        ###################################################################
        # We then reload data in memory, before generating the site       #
        ###################################################################

        # Depends on folder: 'data/'
        if self.data_source is None or event & DATA_FOLDER:
            # class where all data can be accessed from
            self.data_source = DataSource()
            print_progress(text='Load data sources')
        data_source = self.data_source

        # Depends on: 'blog/'
        if self.blog_posts is None or event & BLOG_FOLDER:
            self.blog_posts = load_blog_posts()
            print_progress(text='Load blog posts')

        ###################################################################
        # Once site structure has been created and data is refreshed, we  #
        # can build all parts of the site in parallel, since there are no #
        # dependencies between them.                                      #
        ###################################################################

        # Depends on: 'templates/', 'data/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            print_progress(text='Generate error pages')
            copy_custom_error_pages(data=data_source)

        # Depends on: 'data/', 'templates/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            # Home
            futures.append(executor.submit(build_home, data=data_source))

            # Trackers
            futures.append(
                executor.submit(build_trackers_list, data=data_source))
            futures.append(
                executor.submit(build_tracker_pages, data=data_source))

            # Websites
            futures.append(
                executor.submit(build_website_list, data=data_source))
            futures.append(
                executor.submit(build_website_pages, data=data_source))

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(build_blogpost_list,
                                data=data_source,
                                blog_posts=self.blog_posts))
            futures.append(
                executor.submit(build_blogpost_pages,
                                data=data_source,
                                blog_posts=self.blog_posts))
            futures.append(
                executor.submit(generate_sitemap,
                                data=data_source,
                                blog_posts=self.blog_posts))

        # TODO: uncomment when company profiles are ready
        # if args['site'] or args['companies']:
        #     company_process = Process(target=build_company_pages, args=(data_source,))
        #     company_process.start()

        # Wait for all jobs to finish
        concurrent.futures.wait(futures)

        # Getting the `result` of each promise (although none is expected)
        # allows re-raising exceptions that happened in child processes. If
        # we don't do it, exceptions are silently ignored.
        for future in futures:
            future.result()

    print('Done')
def feed_event(self, event):
    futures = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        ###################################################################
        # This needs to be first, as other tasks will need to write in    #
        # the resulting folders.                                          #
        ###################################################################

        # Depends on folder: 'static/'
        if event & STATIC_FOLDER:
            create_site_structure(static_path=STATIC_PATH)
            print_progress(text='Create _site')

        ###################################################################
        # We then reload data in memory, before generating the site       #
        ###################################################################

        # Depends on folder: 'data/'
        if self.data_source is None or event & DATA_FOLDER:
            # class where all data can be accessed from
            self.data_source = DataSource()
            print_progress(text='Load data sources')
        data_source = self.data_source

        # Depends on: 'blog/'
        if self.blog_posts is None or event & BLOG_FOLDER:
            self.blog_posts = load_blog_posts()
            print_progress(text='Load blog posts')

        ###################################################################
        # Once site structure has been created and data is refreshed, we  #
        # can build all parts of the site in parallel, since there are no #
        # dependencies between them.                                      #
        ###################################################################

        # Depends on: 'templates/', 'data/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            print_progress(text='Generate error pages')
            copy_custom_error_pages(data=data_source)

        def batched_job(inp, batch_fn, batch_size, message):
            batches = []
            input_size = len(inp)
            for batch in [inp[i:i + batch_size]
                          for i in range(0, input_size, batch_size)]:
                submission = executor.submit(batch_fn, batch=batch)
                batches.append(submission)
                futures.append(submission)
            for i, f in enumerate(concurrent.futures.as_completed(batches)):
                print_progress(
                    text=f"{message} {min((i + 1) * batch_size, input_size)}/{input_size}")
            return batches

        # Explorer: depends on 'data/'
        if event & DATA_FOLDER or event & STATIC_FOLDER:
            futures.append(executor.submit(build_explorer))

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(generate_sitemap, blog_posts=self.blog_posts))

        # Depends on: 'data/', 'templates/'
        if event & DATA_FOLDER or event & TEMPLATES_FOLDER:
            # Home
            build_home(data=data_source)
            build_privacy_policy(data=data_source)

            # Trackers
            trackers = [id for id, _ in data_source.trackers.iter()]
            batched_job(trackers, build_tracker_page_batch, 150,
                        "Generate tracker pages")
            build_trackers_list(data=data_source)

            # Websites
            websites = list(
                enumerate([id for id, _ in data_source.sites.iter()]))
            batched_job(websites, build_website_pages_batch, 400,
                        "Generate website pages")
            build_website_list(data=data_source)

            # Companies
            build_company_reach_chart_page(data=data_source)

        # Depends on: 'data/', 'blog/', 'templates/'
        if event & DATA_FOLDER or event & BLOG_FOLDER or event & TEMPLATES_FOLDER:
            futures.append(
                executor.submit(build_blogpost_pages,
                                blog_posts=self.blog_posts))
            futures.append(
                executor.submit(build_rss_feeds, blog_posts=self.blog_posts))
            build_blogpost_list(data=data_source, blog_posts=self.blog_posts)

        if event & DATA_FOLDER:
            build_tracker_db()

            trackers = [id for id, _ in data_source.trackers.iter()]
            data_dir = Path('_site/data/trackers/global')
            if not data_dir.exists():
                data_dir.mkdir(parents=True)
            batched_job(trackers, build_tracker_api_batch, 150,
                        "Generate Tracker API pages")

            site_data_dir = Path('_site/data/sites/global')
            if not site_data_dir.exists():
                site_data_dir.mkdir(parents=True)
            sites = [id for id, _ in data_source.sites.iter()]
            batched_job(sites, build_website_api_batch, 400,
                        "Generate Website API pages")

        # TODO: uncomment when company profiles are ready
        # if args['site'] or args['companies']:
        #     company_process = Process(target=build_company_pages, args=(data_source,))
        #     company_process.start()

        # Wait for all jobs to finish
        concurrent.futures.wait(futures)

        # Getting the `result` of each promise (although none is expected)
        # allows re-raising exceptions that happened in child processes. If
        # we don't do it, exceptions are silently ignored.
        for future in futures:
            future.result()

    print('Done')
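# `feed_event` treats `event` as a bitmask of changed folders. The real flag
# constants live elsewhere in the repo; a sketch of how such flags could be
# defined (the values below are assumptions for illustration):
STATIC_FOLDER = 1 << 0
DATA_FOLDER = 1 << 1
BLOG_FOLDER = 1 << 2
TEMPLATES_FOLDER = 1 << 3

# e.g. rebuild after changes in both 'data/' and 'templates/':
# builder.feed_event(DATA_FOLDER | TEMPLATES_FOLDER)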
def build_tracker_db():
    with open('_site/data/trackerdb.json', 'w') as output:
        db_map = create_tracker_map(load_tracker_db(), with_iab_vendors=True)
        db_map['about'] = 'WhoTracks.Me tracker database: whotracks.me'
        json.dump(db_map, output, indent=2, sort_keys=True)
    print_progress(text='Generate tracker DB')
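# Reading the generated tracker DB back; the full schema comes from
# `create_tracker_map` (defined elsewhere), but the 'about' key is
# guaranteed by build_tracker_db above.
import json

with open('_site/data/trackerdb.json') as f:
    db = json.load(f)
print(db['about'])  # 'WhoTracks.Me tracker database: whotracks.me'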