def extract_and_save(args):
    """Parse one captured page and persist its extracted metadata.

    *args* is a ``(file_name, file_content)`` pair where ``file_content``
    is a mapping with at least the keys ``"url"`` and ``"html"``.  The
    parsed article is written under ``~/.nostalgia_chrome/metadata/``
    using the basename of *file_name*.
    """
    name, content = args
    parsed = parse_article(content["html"], content["url"])
    basename = name.split("/")[-1]
    just.write(parsed, "~/.nostalgia_chrome/metadata/" + basename)
def get_linked_data(x):
    """Return structured ("linked") data for the stored page at ``x["path"]``.

    The stored HTML is read and parsed; the metadata-based extractor is
    tried first, with the json-ld extractor as fallback.  Every outcome —
    including unreadable or empty files, recorded as ``None`` — is
    memoized in ``CACHE`` keyed by path.
    """
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        # Truncated/corrupt stored file: remember the failure.
        html = None
    if html is None or not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
    linked = get_linked_data_md(art)
    if linked is None:
        linked = get_linked_data_jd(art)
    CACHE[path] = linked
    return linked
def get_linked_data(x):
    """Extract YouTube video metadata from the stored HTML at ``x["path"]``.

    Returns a dict with ``title``/``type``/``source``/``image``/
    ``view_count``/``channel`` for a YouTube video page, or ``None``
    otherwise.  Most outcomes are memoized in ``CACHE`` by path; the
    non-youtube and empty-title early returns do NOT cache — preserved
    as-is since changing that would alter re-scan behavior.

    NOTE(review): this shadows the other ``get_linked_data`` definition if
    both live in one module — presumably they come from separate source
    files; confirm.
    """
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        # Truncated/corrupt stored file: cache the failure.
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
    if "youtube" not in art.domain:
        return None
    title = re.sub(" - YouTube$", "", art.tree.xpath("//title/text()")[0])
    if title == "YouTube":
        # Bare "YouTube" title means homepage/feed, not a video page.
        CACHE[path] = None
        return None
    if not title:
        return None
    vc = art.tree.xpath("//span[contains(@class, 'view-count')]/text()")
    vc = re.sub("[^0-9]", "", vc[0]) if vc else None
    # BUG FIX: the query string may have no "v" parameter at all
    # (channel/playlist/youtu.be URLs), so indexing with ["v"] raised
    # KeyError before the `if watch_part:` guard below could run.
    # .get() keeps the intended "no video id -> no thumbnail" path alive.
    query = urllib.parse.urlparse(x["url"]).query
    watch_part = urllib.parse.parse_qs(query).get("v")
    if watch_part:
        image = "http://i3.ytimg.com/vi/{}/maxresdefault.jpg".format(
            watch_part[0])
    else:
        image = None
    channel = art.tree.xpath("//ytd-video-owner-renderer//a/text()")
    if not channel:
        channel = art.tree.xpath("//ytd-channel-name//a/text()")
    channel = " ".join(channel)
    linked_data = {
        "title": title,
        "type": "video",
        "source": "youtube",
        "image": image,
        "view_count": vc,
        "channel": channel,
    }
    CACHE[path] = linked_data
    return linked_data
import gzip
import os

import just
import tldextract
import tqdm
from urllib.parse import urlparse

from auto_extract import parse_article
from utils import KEYS_TO_KEEP

# Re-extract metadata for every stored page: for each v1 meta record that
# has not yet been processed by extruct, re-parse the matching gzipped HTML
# snapshot, overwrite the metadata, and keep the meta file's timestamps in
# sync with the html file's.
for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # BUG FIX: str.rstrip(".json") strips any trailing run of characters
    # from the set {., j, s, o, n} (e.g. "mason.json" -> "ma"), not the
    # literal ".json" suffix, producing wrong html paths for some names.
    base = x.split("/")[-1]
    if base.endswith(".json"):
        base = base[:-len(".json")]
    html_path = "/home/pascal/.nostalgia/html/" + base + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        # Copy the html file's timestamps onto the rewritten meta file
        # (replaces `os.system("touch ... -r ...")`, which broke on paths
        # containing single quotes and spawned a shell per file).
        st = os.stat(html_path)
        os.utime(x, (st.st_atime, st.st_mtime))
        print("done", x)
def slug_url(url):
    """Reduce *url* to a filesystem-safe slug.

    Runs of dashes/whitespace collapse to a single dash, every remaining
    non-word character is dropped, and only the lowercased last 150
    characters are kept.
    """
    collapsed = re.sub(r"[-\s]+", "-", url)
    return re.sub(r"[^\w\s-]", "", collapsed).strip().lower()[-150:]


# Migrate legacy capture files: each old JSON dump is split into a gzipped
# HTML snapshot plus a v1 metadata record; both inherit the legacy file's
# timestamps, then the legacy file is removed.
for old_path in tqdm.tqdm(
        just.glob("/home/pascal/.nostalgia_chrome/old/html/*.json")):
    ctime = os.path.getctime(old_path)
    with open(old_path) as fh:
        print("processing", old_path)
        payload = json.load(fh)
    html = payload["html"]
    url = payload["url"]
    slugged_url = slug_url(url)
    article = parse_article(html, url)
    meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
    meta["creation_time"] = ctime
    meta["slugged_url"] = slugged_url
    html_path = "/home/pascal/.nostalgia_chrome/html/{}_{}.html.gz".format(
        ctime, slugged_url)
    with gzip.GzipFile(html_path, "w") as gz:
        gz.write(html.encode("utf8"))
    meta_path = "/home/pascal/.nostalgia_chrome/meta/v1/{}_{}.json".format(
        ctime, slugged_url)
    just.write(meta, meta_path)
    os.system("touch '{}' -r '{}'".format(html_path, old_path))
    os.system("touch '{}' -r '{}'".format(meta_path, old_path))
    just.remove(old_path)