from threading import Lock

import xlwt


class Spider(object):
    start_urls = ['']

    def __init__(self):
        self.count = 0  # tracks the current row count of the xls sheet
        self.downloader = Downloader()
        self.next_url = None
        self.mutex = Lock()  # lock guarding file reads and writes

    def __enter__(self):
        # create the xls workbook
        self.wb = xlwt.Workbook()
        # add a worksheet
        self.sheet = self.wb.add_sheet('Sheet 1')
        self.start_request()
        return self  # return the instance so "with Spider() as s:" works

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.wb.save('data.xls')

    def start_request(self):
        for url in self.start_urls:
            self.downloader.add_job(priority_number=1, job=url, handle=self.parse)
        self.downloader.create_threads()

    def parse(self, response):
        pass
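# --- Usage sketch (not part of the original code): a minimal subclass showing
# how Spider is meant to be driven as a context manager. It assumes the
# Downloader used above is available and that parse() receives the downloaded
# response; ExampleSpider, its URL, and the row written here are hypothetical.
class ExampleSpider(Spider):
    start_urls = ['https://example.com/']

    def parse(self, response):
        # write one row per response; the mutex guards concurrent sheet writes
        with self.mutex:
            self.sheet.write(self.count, 0, str(response))
            self.count += 1


with ExampleSpider():
    pass  # the crawl starts in __enter__; data.xls is saved in __exit__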
def main(cache_dir):
    okapi = Okapi(OC_OKAPI_KEY, user_agent=USER_AGENT)

    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "json"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "orig"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "small"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "big"), exist_ok=True)

    file_name = os.path.join(cache_dir, "json", "caches.json")
    if os.path.isfile(file_name):
        json_data = load_json(file_name)
    else:
        print("-- downloading query...")
        oc_codes = download_query(OC_USERNAME, OC_PASSWORD, OC_QUERYID)
        try:
            with open(MANUAL_CACHES_FILE, "r") as f:
                for oc_code in f:
                    oc_code = oc_code.strip()
                    if oc_code.startswith("OC"):
                        print("-- adding manual code {}".format(oc_code))
                        oc_codes.append(oc_code)
        except IOError:
            pass
        print("-> codes: {}".format(len(oc_codes)))
        fields = [
            'code', 'name', 'location', 'status', 'url', 'owner', 'founds',
            'date_hidden', 'date_created', 'short_description', 'description',
            'images', 'preview_image', 'internal_id'
        ]
        json_data = okapi.get_caches(oc_codes, fields)
        store_json(file_name, json_data)

    print("-- analyzing cache data...")
    caches = load_caches(json_data)
    caches = sorted(caches, key=lambda c: c._date, reverse=True)
    print("-> caches: {}".format(len(caches)))

    print("-- analyzing log data...")
    total_logs = 0
    logs_without_coords = 0
    for cache in caches:
        file_name = os.path.join(cache_dir, "json", f"{cache._code}-logs.json")
        if os.path.isfile(file_name):
            json_data = load_json(file_name)
        else:
            fields = [
                'uuid', 'date', 'user', 'type', 'comment', 'images',
                'internal_id'
            ]
            json_data = okapi.get_logs(cache._code, fields)
            store_json(file_name, json_data)
        cache._logs = load_logs(json_data)
        for log in cache._logs:
            total_logs += 1
            if log._coordinates is None:
                logs_without_coords += 1
    print("-- logs without coordinates: {}/{}".format(logs_without_coords, total_logs))

    print("-- downloading missing images...")
    downloader = Downloader(threads=4, user_agent=USER_AGENT)
    thumbnailer = Thumbnailer(threads=4)
    for cache in caches:
        if cache._preview_image is not None:
            extension = 'noext'
            m = re.match(r'^.*\.([^.\?]+)(\?.*)?$', cache._preview_image)
            if m:
                extension = m.group(1)
            raw_image = '{}/{}/{}.{}'.format(cache_dir, "orig", cache._code, extension)
            downloader.add_job(cache._preview_image, raw_image)
            thumb_small = '{}/{}/{}.jpg'.format(cache_dir, "small", cache._code)
            thumbnailer.add_job(raw_image, thumb_small, SIZE_SMALL)
            thumb_big = '{}/{}/{}.jpg'.format(cache_dir, "big", cache._code)
            thumbnailer.add_job(raw_image, thumb_big, SIZE_BIG)
    downloader.run()

    print("-- scaling images...")
    thumbnailer.run()

    print("-- creating files...")
    create_db(caches, os.path.join(cache_dir, "safari.sqlite"))
    collect_logs(caches, os.path.join(cache_dir, "log-data.js"))
    createlist(caches, 30, cache_dir)
    create_feed(caches, os.path.join(cache_dir, "feed.xml"))
    create_sidebar(caches, "static/index.html", os.path.join(cache_dir, "index.html"), cache_dir)
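# --- Entry-point sketch (assumption, not from the original file): main() only
# needs a cache directory, so a minimal CLI wrapper could look like this; the
# "--cache-dir" flag name and the ".cache" default are hypothetical.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Fetch opencaching data and build the output files.")
    parser.add_argument("--cache-dir", default=".cache",
                        help="directory for downloaded data")
    args = parser.parse_args()
    main(args.cache_dir)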
def main():
    okapi = Okapi(OC_OKAPI_KEY, user_agent=USER_AGENT)

    mkdir(CACHE_DIR)
    mkdir("{}/json".format(CACHE_DIR))
    mkdir("{}/orig".format(CACHE_DIR))
    mkdir("{}/small".format(CACHE_DIR))
    mkdir("{}/big".format(CACHE_DIR))

    file_name = "{}/json/caches.json".format(CACHE_DIR)
    if os.path.isfile(file_name):
        json_data = load_json(file_name)
    else:
        print("-- downloading query...")
        oc_codes = download_query(OC_USERNAME, OC_PASSWORD, OC_QUERYID)
        try:
            with open(MANUAL_CACHES_FILE, "r") as f:
                for oc_code in f:
                    oc_code = oc_code.strip()
                    if oc_code.startswith("OC"):
                        print("-- adding manual code {}".format(oc_code))
                        oc_codes.append(oc_code)
        except IOError:
            pass
        print("-> codes: {}".format(len(oc_codes)))
        fields = ['code', 'name', 'location', 'status', 'url', 'owner',
                  'founds', 'date_hidden', 'date_created', 'short_description',
                  'description', 'images', 'preview_image', 'internal_id']
        json_data = okapi.get_caches(oc_codes, fields)
        store_json(file_name, json_data)

    print("-- analyzing cache data...")
    caches = load_caches(json_data)
    caches = sorted(caches, key=lambda c: c._date, reverse=True)
    print("-> caches: {}".format(len(caches)))

    print("-- analyzing log data...")
    total_logs = 0
    logs_without_coords = 0
    for cache in caches:
        file_name = "{}/json/{}-logs.json".format(CACHE_DIR, cache._code)
        if os.path.isfile(file_name):
            json_data = load_json(file_name)
        else:
            fields = ['uuid', 'date', 'user', 'type', 'comment', 'images',
                      'internal_id']
            json_data = okapi.get_logs(cache._code, fields)
            store_json(file_name, json_data)
        cache._logs = load_logs(json_data)
        for log in cache._logs:
            total_logs += 1
            if log._coordinates is None:
                logs_without_coords += 1
    print("-- logs without coordinates: {}/{}".format(logs_without_coords, total_logs))

    print("-- downloading missing images...")
    downloader = Downloader(threads=4, user_agent=USER_AGENT)
    thumbnailer = Thumbnailer(threads=4)
    for cache in caches:
        if cache._preview_image is not None:
            extension = 'noext'
            m = re.match(r'^.*\.([^.\?]+)(\?.*)?$', cache._preview_image)
            if m:
                extension = m.group(1)
            raw_image = '{}/{}/{}.{}'.format(CACHE_DIR, "orig", cache._code, extension)
            downloader.add_job(cache._preview_image, raw_image)
            thumb_small = '{}/{}/{}.jpg'.format(CACHE_DIR, "small", cache._code)
            thumbnailer.add_job(raw_image, thumb_small, SIZE_SMALL)
            thumb_big = '{}/{}/{}.jpg'.format(CACHE_DIR, "big", cache._code)
            thumbnailer.add_job(raw_image, thumb_big, SIZE_BIG)
    downloader.run()

    print("-- scaling images...")
    thumbnailer.run()

    print("-- creating files...")
    create_db(caches, ".cache/safari.sqlite")
    collect_logs(caches, ".cache/log-data.js")
    createlist(caches, 30)
    create_feed(caches, ".cache/feed.xml")
    create_sidebar(caches, "static/index.html", ".cache/index.html")
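# --- Helper sketch (assumption): mkdir() is not shown in this excerpt; a
# minimal version consistent with how it is called above could simply wrap
# os.makedirs with exist_ok=True.
import os


def mkdir(path):
    # create the directory (and any parents) if it does not exist yet
    os.makedirs(path, exist_ok=True)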
class Aggregator:
    def __init__(
        self,
        podcasts_json: str,
        cache_dir: str,
        export_dir: str,
        templates_dir: str,
        base_url: str,
    ) -> None:
        self._downloader = Downloader(
            4, "Lauf Podcast Aggregator, https://lauf-podcasts.flopp.net/")
        self._imagescaler = ImageScaler(4)
        self._podcasts_json_file = podcasts_json
        self._cache_dir = cache_dir
        self._export_dir = export_dir
        self._base_url = base_url
        self._podcasts: List[Dict[str, Any]] = []
        self._jinja = jinja2.Environment(
            loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
        self._jinja.filters["timestamp2date"] = self.format_date
        self._jinja.filters["timestamp2datetime"] = self.format_datetime
        self._jinja.filters["formatseconds"] = self.format_seconds
        self._jinja.globals["now"] = datetime.datetime.now().timestamp()
        self._jinja.globals["base_url"] = base_url

    def format_date(self, value: Optional[float]) -> str:
        if value is None:
            return "n/a"
        d = datetime.datetime.fromtimestamp(value)
        return d.strftime("%F")

    def format_datetime(self, value: Optional[float]) -> str:
        if value is None:
            return "n/a"
        d = datetime.datetime.fromtimestamp(value)
        return d.strftime("%F %T")

    def format_seconds(self, value: Optional[int]) -> str:
        if value is None:
            return "n/a"
        if value == 0:
            return "n/a"
        d = datetime.timedelta(seconds=value)
        return str(d)

    def clear_cache(self) -> None:
        if os.path.isdir(self._cache_dir):
            shutil.rmtree(self._cache_dir)

    def sync(self, keep_feeds: bool = False) -> None:
        with open(self._podcasts_json_file, "r") as f:
            self._podcasts = json.load(f)

        # filter podcasts without title or feed url
        self._podcasts = [
            p for p in self._podcasts if ("title" in p) and ("feed" in p)
        ]

        for podcast in self._podcasts:
            title = podcast["title"]
            feed_url = podcast["feed"]
            sanitized_title = self.sanitize(title)
            podcast["sanitized_title"] = sanitized_title
            dir = "{}/{}".format(self._cache_dir, sanitized_title)
            podcast["raw_dir"] = dir
            feed_file = "{}/feed".format(dir)
            podcast["feed_file"] = feed_file
            added = datetime.datetime.strptime(podcast["added"], "%Y-%m-%d")
            if added + datetime.timedelta(weeks=2) >= datetime.datetime.now():
                podcast["new"] = 1
            os.makedirs(dir, exist_ok=True)
            self._downloader.add_job(feed_url, feed_file, force=(not keep_feeds))
        self._downloader.run()

        # filter podcasts with non-existent feed file
        self._podcasts = [
            p for p in self._podcasts if os.path.exists(p["feed_file"])
        ]

        for podcast in self._podcasts:
            podcast["skip"] = False
            feed_url = podcast["feed"]
            feed_file = podcast["feed_file"]
            try:
                with open(feed_file, "r") as f:
                    podcast["data"] = podcastparser_parse(feed_url, f)
            except Exception:
                podcast["skip"] = True
                continue

            # determine latest publish date
            last_publish = None
            for episode in podcast["data"]["episodes"]:
                if not last_publish or episode["published"] > last_publish:
                    last_publish = episode["published"]
            podcast["last_publish"] = last_publish
            if last_publish is None:
                print("no last publish date: {}".format(podcast["title"]))

            # format descriptions
            podcast["data"]["description_formatted"] = self.format_description(
                podcast["data"]["description"])
            for episode in podcast["data"]["episodes"]:
                if "description_html" in episode:
                    episode["description_formatted"] = self.clean_html(
                        episode["description_html"])
                else:
                    episode["description_formatted"] = self.format_description(
                        episode["description"])

            # determine cover image
            cover_url = None
            if "cover_url" in podcast:
                cover_url = podcast["cover_url"]
            if not cover_url and ("cover_url" in podcast["data"]):
                cover_url = podcast["data"]["cover_url"]
            if cover_url:
                dir = "{}/{}".format(self._cache_dir,
                                     podcast["sanitized_title"])
                cover_file = "{}/cover".format(dir)
                self._downloader.add_job(cover_url, cover_file)
        self._downloader.run()

        # filter podcasts with skip attribute
        self._podcasts = [p for p in self._podcasts if not p["skip"]]

        # sort by reversed 'last_publish' timestamp
        self._podcasts.sort(key=lambda x: -x["last_publish"] if x["last_publish"] is not None else 0)

    def clean_html(self, html: str) -> str:
        soup = BeautifulSoup(html, features="html.parser")
        for tag in soup():
            for attribute in ["class", "id", "name", "style"]:
                del tag[attribute]
        for tag_name in ["audio", "figure", "iframe", "img", "script", "video"]:
            for tag in soup(tag_name):
                tag.decompose()
        return str(soup)

    def format_description(self, description: str) -> str:
        re_newline = re.compile(r"(\n)")
        re_divider = re.compile(r"((?:---+)|(?:\*\*\*+)|(?:\+\+\++))")
        re_link = re.compile(r"(https?://[A-Za-z0-9/.=\?&%_\-]+)")
        s = description
        s = re_newline.sub(r"<br />", s)
        s = re_divider.sub(r"<br />\1<br />", s)
        s = re_link.sub(r'<a href="\1" rel="nofollow" target="_blank">\1</a>', s)
        return s

    def export(self) -> None:
        for podcast in self._podcasts:
            self._imagescaler.add_job(
                "{}/{}/cover".format(self._cache_dir,
                                     podcast["sanitized_title"]),
                "{}/{}/cover.jpg".format(self._export_dir,
                                         podcast["sanitized_title"]),
                512,
            )
        self._imagescaler.run()

        self.export_info()
        self.export_impressum()
        self.export_index()
        self.export_sitemap()
        for podcast in self._podcasts:
            self.export_podcast(podcast)
        self._imagescaler.run()

    def export_info(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/info.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("info.html")
            f.write(template.render())

    def export_impressum(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/impressum.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("impressum.html")
            f.write(template.render())

    def export_index(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/index.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("index.html")
            f.write(template.render(podcasts=self._podcasts))

    def export_sitemap(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/sitemap.xml".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("sitemap.xml")
            f.write(template.render(podcasts=self._podcasts))

    def export_podcast(self, podcast: Dict[str, Any]) -> None:
        dir = "{}/{}".format(self._export_dir, podcast["sanitized_title"])
        os.makedirs(dir, exist_ok=True)
        with open("{}/index.html".format(dir), "w") as f:
            template = self._jinja.get_template("podcast.html")
            f.write(
                template.render(podcast=podcast,
                                episodes=podcast["data"]["episodes"]))

    def sanitize(self, s: str) -> str:
        return re.sub(r"\W+", "-", s).lower()
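# --- Usage sketch (assumption, not from the original code): the aggregator is
# driven by constructing it and calling sync() followed by export(); the file
# and directory names below are hypothetical examples.
if __name__ == "__main__":
    aggregator = Aggregator(
        podcasts_json="podcasts.json",
        cache_dir=".cache",
        export_dir="export",
        templates_dir="templates",
        base_url="https://lauf-podcasts.flopp.net/",
    )
    aggregator.sync(keep_feeds=True)  # reuse previously downloaded feed files
    aggregator.export()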